author      Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>   2023-09-27 17:46:17 +0100
committer   felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>   2023-09-28 12:08:05 +0000
commit      afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch)
tree        03bc7d5a762099989b16a656fa8d397b490ed70e
parent      bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff)
download    ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz
Apply clang-format on repository
Code is formatted as per a revised clang format configuration file (not part of this delivery). Version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)

And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
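For context, the sketch below shows one possible way to apply the formatting described above: running clang-format 14 over the repository's C/C++ sources while skipping .cl files and the excluded directories. It is illustrative only and is not the tooling used for this change; the binary name clang-format-14, the suffix list, and the format_repo helper are assumptions introduced here.

#!/usr/bin/env python3
# Illustrative sketch (not part of this delivery): format C/C++ sources with
# clang-format 14 while honouring the exclusion list from the commit message.
import subprocess
from pathlib import Path

EXCLUDED_DIRS = (
    "compute_kernel_writer/validation/",
    "tests/",
    "include/",
    "src/core/NEON/kernels/convolution/",
    "src/core/NEON/kernels/arm_gemm/",
    "src/core/NEON/kernels/arm_conv/",
    "data/",
)
# .cl files are deliberately absent: the commit excludes them from this pass.
CXX_SUFFIXES = {".h", ".hpp", ".inl", ".c", ".cpp", ".cc"}

def format_repo(root: str = ".") -> None:
    root_path = Path(root)
    for path in root_path.rglob("*"):
        if not path.is_file() or path.suffix not in CXX_SUFFIXES:
            continue
        rel = path.relative_to(root_path).as_posix()
        if any(rel.startswith(d) for d in EXCLUDED_DIRS):
            continue
        # -i edits in place; -style=file picks up the repository's .clang-format.
        subprocess.run(["clang-format-14", "-i", "-style=file", str(path)], check=True)

if __name__ == "__main__":
    format_repo()

Run from the repository root, this assumes the revised clang-format configuration is available locally as a .clang-format file so that -style=file picks it up; the commit notes that the configuration itself is not part of the delivery.

Diffstat: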
-rw-r--r--arm_compute/Acl.h3
-rw-r--r--arm_compute/Acl.hpp69
-rw-r--r--arm_compute/AclDescriptors.h57
-rw-r--r--arm_compute/AclEntrypoints.h69
-rw-r--r--arm_compute/AclOpenClExt.h27
-rw-r--r--arm_compute/AclOperators.h15
-rw-r--r--arm_compute/AclTypes.h313
-rw-r--r--arm_compute/AclUtils.h11
-rw-r--r--arm_compute/AclVersion.h23
-rw-r--r--arm_compute/core/CL/CLCompileContext.h17
-rw-r--r--arm_compute/core/CL/CLDevice.h16
-rw-r--r--arm_compute/core/CL/CLHelpers.h8
-rw-r--r--arm_compute/core/CL/CLTypes.h6
-rw-r--r--arm_compute/core/CL/ICLArray.h5
-rw-r--r--arm_compute/core/CL/ICLTensor.h7
-rw-r--r--arm_compute/core/CL/OpenCL.h9
-rw-r--r--arm_compute/core/CPP/CPPTypes.h12
-rw-r--r--arm_compute/core/CPP/ICPPKernel.h4
-rw-r--r--arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h18
-rw-r--r--arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h18
-rw-r--r--arm_compute/core/CPP/kernels/CPPTopKVKernel.h3
-rw-r--r--arm_compute/core/Coordinates.h5
-rw-r--r--arm_compute/core/CoreTypes.h26
-rw-r--r--arm_compute/core/Dimensions.h15
-rw-r--r--arm_compute/core/Error.h134
-rw-r--r--arm_compute/core/Helpers.h35
-rw-r--r--arm_compute/core/Helpers.inl57
-rw-r--r--arm_compute/core/IAccessWindow.h15
-rw-r--r--arm_compute/core/IArray.h11
-rw-r--r--arm_compute/core/IKernel.h2
-rw-r--r--arm_compute/core/ITensor.h4
-rw-r--r--arm_compute/core/ITensorInfo.h13
-rw-r--r--arm_compute/core/ITensorPack.h12
-rw-r--r--arm_compute/core/KernelDescriptors.h178
-rw-r--r--arm_compute/core/Log.h16
-rw-r--r--arm_compute/core/PixelValue.h83
-rw-r--r--arm_compute/core/QuantizationInfo.h52
-rw-r--r--arm_compute/core/Rounding.h2
-rw-r--r--arm_compute/core/Size2D.h6
-rw-r--r--arm_compute/core/Size3D.h3
-rw-r--r--arm_compute/core/Steps.h5
-rw-r--r--arm_compute/core/Strides.h3
-rw-r--r--arm_compute/core/SubTensorInfo.h9
-rw-r--r--arm_compute/core/TensorInfo.h69
-rw-r--r--arm_compute/core/TensorShape.h31
-rw-r--r--arm_compute/core/Types.h294
-rw-r--r--arm_compute/core/Utils.h63
-rw-r--r--arm_compute/core/Validate.h480
-rw-r--r--arm_compute/core/Version.h2
-rw-r--r--arm_compute/core/Window.h16
-rw-r--r--arm_compute/core/Window.inl52
-rw-r--r--arm_compute/core/WindowIterator.h22
-rw-r--r--arm_compute/core/experimental/Types.h20
-rw-r--r--arm_compute/core/utils/ActivationFunctionUtils.h2
-rw-r--r--arm_compute/core/utils/DataLayoutUtils.h2
-rw-r--r--arm_compute/core/utils/DataTypeUtils.h46
-rw-r--r--arm_compute/core/utils/FormatUtils.h30
-rw-r--r--arm_compute/core/utils/InterpolationPolicyUtils.h2
-rw-r--r--arm_compute/core/utils/StringUtils.h2
-rw-r--r--arm_compute/core/utils/helpers/AdjustVecSize.h6
-rw-r--r--arm_compute/core/utils/helpers/tensor_transform.h33
-rw-r--r--arm_compute/core/utils/logging/FilePrinter.h3
-rw-r--r--arm_compute/core/utils/logging/Helpers.h3
-rw-r--r--arm_compute/core/utils/logging/IPrinter.h3
-rw-r--r--arm_compute/core/utils/logging/LogMsgDecorators.h5
-rw-r--r--arm_compute/core/utils/logging/Logger.h6
-rw-r--r--arm_compute/core/utils/logging/LoggerRegistry.h13
-rw-r--r--arm_compute/core/utils/logging/Macros.h16
-rw-r--r--arm_compute/core/utils/logging/Types.h6
-rw-r--r--arm_compute/core/utils/math/Math.h2
-rw-r--r--arm_compute/core/utils/math/SafeOps.h23
-rw-r--r--arm_compute/core/utils/misc/InfoHelpers.h54
-rw-r--r--arm_compute/core/utils/misc/Macros.h9
-rw-r--r--arm_compute/core/utils/misc/ShapeCalculator.h373
-rw-r--r--arm_compute/core/utils/misc/Traits.h1
-rw-r--r--arm_compute/core/utils/misc/Utility.h35
-rw-r--r--arm_compute/core/utils/quantization/AsymmHelpers.h24
-rw-r--r--arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h1
-rw-r--r--arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h16
-rw-r--r--arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h4
-rw-r--r--arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h7
-rw-r--r--arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h11
-rw-r--r--arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h4
-rw-r--r--arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h7
-rw-r--r--arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h2
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h2
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h12
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h13
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h13
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h7
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h7
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h12
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h12
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h10
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h13
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h13
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h9
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h5
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h12
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h9
-rw-r--r--arm_compute/function_info/ActivationLayerInfo.h29
-rw-r--r--arm_compute/function_info/ConvolutionInfo.h14
-rw-r--r--arm_compute/function_info/FullyConnectedLayerInfo.h12
-rw-r--r--arm_compute/function_info/GEMMInfo.h44
-rw-r--r--arm_compute/function_info/MatMulInfo.h4
-rw-r--r--arm_compute/graph/Edge.h14
-rw-r--r--arm_compute/graph/Graph.h18
-rw-r--r--arm_compute/graph/GraphBuilder.h196
-rw-r--r--arm_compute/graph/GraphContext.h17
-rw-r--r--arm_compute/graph/IDeviceBackend.h3
-rw-r--r--arm_compute/graph/LayerDescriptors.h35
-rw-r--r--arm_compute/graph/Logger.h16
-rw-r--r--arm_compute/graph/Tensor.h3
-rw-r--r--arm_compute/graph/TensorDescriptor.h16
-rw-r--r--arm_compute/graph/TypePrinter.h12
-rw-r--r--arm_compute/graph/Types.h23
-rw-r--r--arm_compute/graph/Utils.h2
-rw-r--r--arm_compute/graph/Workload.h13
-rw-r--r--arm_compute/graph/backends/BackendRegistrar.h4
-rw-r--r--arm_compute/graph/backends/CL/CLDeviceBackend.h20
-rw-r--r--arm_compute/graph/backends/CL/CLSubTensorHandle.h14
-rw-r--r--arm_compute/graph/backends/CL/CLTensorHandle.h9
-rw-r--r--arm_compute/graph/backends/FunctionHelpers.h655
-rw-r--r--arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h31
-rw-r--r--arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h36
-rw-r--r--arm_compute/graph/backends/NEON/NEDeviceBackend.h16
-rw-r--r--arm_compute/graph/backends/NEON/NESubTensorHandle.h14
-rw-r--r--arm_compute/graph/backends/NEON/NETensorHandle.h9
-rw-r--r--arm_compute/graph/backends/Utils.h8
-rw-r--r--arm_compute/graph/backends/ValidateHelpers.h132
-rw-r--r--arm_compute/graph/frontend/IStream.h4
-rw-r--r--arm_compute/graph/frontend/Layers.h416
-rw-r--r--arm_compute/graph/frontend/Stream.h3
-rw-r--r--arm_compute/graph/frontend/SubStream.h2
-rw-r--r--arm_compute/graph/frontend/Types.h29
-rw-r--r--arm_compute/graph/mutators/DepthConcatSubTensorMutator.h2
-rw-r--r--arm_compute/graph/mutators/GroupedConvolutionMutator.h2
-rw-r--r--arm_compute/graph/mutators/InPlaceOperationMutator.h2
-rw-r--r--arm_compute/graph/mutators/NodeExecutionMethodMutator.h2
-rw-r--r--arm_compute/graph/mutators/NodeFusionMutator.h2
-rw-r--r--arm_compute/graph/mutators/SplitLayerSubTensorMutator.h2
-rw-r--r--arm_compute/graph/mutators/SyntheticDataTypeMutator.h2
-rw-r--r--arm_compute/graph/nodes/ActivationLayerNode.h5
-rw-r--r--arm_compute/graph/nodes/ArgMinMaxLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/BatchNormalizationLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ChannelShuffleLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ConcatenateLayerNode.h5
-rw-r--r--arm_compute/graph/nodes/ConstNode.h2
-rw-r--r--arm_compute/graph/nodes/ConvolutionLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/DeconvolutionLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/DepthToSpaceLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/DequantizationLayerNode.h4
-rw-r--r--arm_compute/graph/nodes/DetectionOutputLayerNode.h5
-rw-r--r--arm_compute/graph/nodes/DetectionPostProcessLayerNode.h4
-rw-r--r--arm_compute/graph/nodes/DummyNode.h4
-rw-r--r--arm_compute/graph/nodes/EltwiseLayerNode.h4
-rw-r--r--arm_compute/graph/nodes/FlattenLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/FullyConnectedLayerNode.h4
-rw-r--r--arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h5
-rw-r--r--arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h4
-rw-r--r--arm_compute/graph/nodes/GenerateProposalsLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/InputNode.h2
-rw-r--r--arm_compute/graph/nodes/L2NormalizeLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/Nodes.h4
-rw-r--r--arm_compute/graph/nodes/NormalizationLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/OutputNode.h2
-rw-r--r--arm_compute/graph/nodes/PReluLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/PadLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/PermuteLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/PoolingLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/PrintLayerNode.h6
-rw-r--r--arm_compute/graph/nodes/PriorBoxLayerNode.h5
-rw-r--r--arm_compute/graph/nodes/QuantizationLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ROIAlignLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ReductionLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ReorgLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ReshapeLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/ResizeLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/SliceLayerNode.h5
-rw-r--r--arm_compute/graph/nodes/SoftmaxLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/SplitLayerNode.h6
-rw-r--r--arm_compute/graph/nodes/StackLayerNode.h2
-rw-r--r--arm_compute/graph/nodes/StridedSliceLayerNode.h2
-rw-r--r--arm_compute/graph/printers/DotGraphPrinter.h1
-rw-r--r--arm_compute/runtime/Allocator.h7
-rw-r--r--arm_compute/runtime/Array.h8
-rw-r--r--arm_compute/runtime/BlobLifetimeManager.h3
-rw-r--r--arm_compute/runtime/BlobMemoryPool.h5
-rw-r--r--arm_compute/runtime/CL/CLArray.h11
-rw-r--r--arm_compute/runtime/CL/CLBufferAllocator.h6
-rw-r--r--arm_compute/runtime/CL/CLMemory.h7
-rw-r--r--arm_compute/runtime/CL/CLMemoryRegion.h6
-rw-r--r--arm_compute/runtime/CL/CLRuntimeContext.h8
-rw-r--r--arm_compute/runtime/CL/CLScheduler.h23
-rw-r--r--arm_compute/runtime/CL/CLSubTensor.h9
-rw-r--r--arm_compute/runtime/CL/CLTensor.h6
-rw-r--r--arm_compute/runtime/CL/CLTensorAllocator.h7
-rw-r--r--arm_compute/runtime/CL/CLTuner.h6
-rw-r--r--arm_compute/runtime/CL/CLTunerTypes.h13
-rw-r--r--arm_compute/runtime/CL/CLTuningParams.h29
-rw-r--r--arm_compute/runtime/CL/CLTypes.h12
-rw-r--r--arm_compute/runtime/CL/ICLGEMMKernelSelection.h6
-rw-r--r--arm_compute/runtime/CL/ICLOperator.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLActivationLayer.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h35
-rw-r--r--arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h27
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseAnd.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseNot.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseOr.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLBitwiseXor.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h16
-rw-r--r--arm_compute/runtime/CL/functions/CLCast.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLComparison.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLConcatenateLayer.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLConv3D.h21
-rw-r--r--arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h21
-rw-r--r--arm_compute/runtime/CL/functions/CLConvolutionLayer.h50
-rw-r--r--arm_compute/runtime/CL/functions/CLCopy.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLCrop.h28
-rw-r--r--arm_compute/runtime/CL/functions/CLCropResize.h29
-rw-r--r--arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h32
-rw-r--r--arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthConvertLayer.h9
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h31
-rw-r--r--arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h22
-rw-r--r--arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h27
-rw-r--r--arm_compute/runtime/CL/functions/CLElementwiseOperations.h116
-rw-r--r--arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLFFT1D.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLFFT2D.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h31
-rw-r--r--arm_compute/runtime/CL/functions/CLFill.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLFloor.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h22
-rw-r--r--arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h41
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMM.h25
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h38
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h19
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h19
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h14
-rw-r--r--arm_compute/runtime/CL/functions/CLGather.h9
-rw-r--r--arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h23
-rw-r--r--arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h22
-rw-r--r--arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h24
-rw-r--r--arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h7
-rw-r--r--arm_compute/runtime/CL/functions/CLLSTMLayer.h91
-rw-r--r--arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h67
-rw-r--r--arm_compute/runtime/CL/functions/CLLogicalAnd.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLLogicalNot.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLLogicalOr.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLMatMul.h24
-rw-r--r--arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h13
-rw-r--r--arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLNormalizationLayer.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h9
-rw-r--r--arm_compute/runtime/CL/functions/CLPadLayer.h20
-rw-r--r--arm_compute/runtime/CL/functions/CLPermute.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h44
-rw-r--r--arm_compute/runtime/CL/functions/CLPooling3dLayer.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLPoolingLayer.h17
-rw-r--r--arm_compute/runtime/CL/functions/CLPriorBoxLayer.h13
-rw-r--r--arm_compute/runtime/CL/functions/CLQLSTMLayer.h240
-rw-r--r--arm_compute/runtime/CL/functions/CLRNNLayer.h29
-rw-r--r--arm_compute/runtime/CL/functions/CLROIAlignLayer.h14
-rw-r--r--arm_compute/runtime/CL/functions/CLROIPoolingLayer.h14
-rw-r--r--arm_compute/runtime/CL/functions/CLRange.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLReduceMean.h11
-rw-r--r--arm_compute/runtime/CL/functions/CLReductionOperation.h18
-rw-r--r--arm_compute/runtime/CL/functions/CLReshapeLayer.h1
-rw-r--r--arm_compute/runtime/CL/functions/CLReverse.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLScale.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLSelect.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLSlice.h18
-rw-r--r--arm_compute/runtime/CL/functions/CLSoftmaxLayer.h6
-rw-r--r--arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h34
-rw-r--r--arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLSplit.h1
-rw-r--r--arm_compute/runtime/CL/functions/CLStackLayer.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLStridedSlice.h57
-rw-r--r--arm_compute/runtime/CL/functions/CLTile.h5
-rw-r--r--arm_compute/runtime/CL/functions/CLTranspose.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLUnstack.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h28
-rw-r--r--arm_compute/runtime/CL/tuners/CLTuningParametersList.h1
-rw-r--r--arm_compute/runtime/CPP/CPPScheduler.h6
-rw-r--r--arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h25
-rw-r--r--arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h24
-rw-r--r--arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h22
-rw-r--r--arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h18
-rw-r--r--arm_compute/runtime/CPP/functions/CPPPermute.h5
-rw-r--r--arm_compute/runtime/CPP/functions/CPPSplit.h53
-rw-r--r--arm_compute/runtime/CPP/functions/CPPTopKV.h6
-rw-r--r--arm_compute/runtime/CPP/functions/CPPUpsample.h5
-rw-r--r--arm_compute/runtime/FunctionDescriptors.h36
-rw-r--r--arm_compute/runtime/IAllocator.h2
-rw-r--r--arm_compute/runtime/IFunction.h2
-rw-r--r--arm_compute/runtime/IMemoryGroup.h5
-rw-r--r--arm_compute/runtime/IMemoryManager.h2
-rw-r--r--arm_compute/runtime/IMemoryPool.h2
-rw-r--r--arm_compute/runtime/IMemoryRegion.h3
-rw-r--r--arm_compute/runtime/IPoolManager.h2
-rw-r--r--arm_compute/runtime/IScheduler.h8
-rw-r--r--arm_compute/runtime/ISimpleLifetimeManager.h18
-rw-r--r--arm_compute/runtime/ITensorAllocator.h6
-rw-r--r--arm_compute/runtime/ITransformWeights.h8
-rw-r--r--arm_compute/runtime/IWeightsManager.h8
-rw-r--r--arm_compute/runtime/Memory.h5
-rw-r--r--arm_compute/runtime/MemoryGroup.h21
-rw-r--r--arm_compute/runtime/MemoryManagerOnDemand.h12
-rw-r--r--arm_compute/runtime/MemoryRegion.h24
-rw-r--r--arm_compute/runtime/NEON/INEOperator.h6
-rw-r--r--arm_compute/runtime/NEON/INESimpleFunction.h2
-rw-r--r--arm_compute/runtime/NEON/NEScheduler.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEActivationLayer.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEAddMulAdd.h23
-rw-r--r--arm_compute/runtime/NEON/functions/NEArithmeticAddition.h13
-rw-r--r--arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NECast.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEConcatenateLayer.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEConv3D.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h11
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvolutionLayer.h38
-rw-r--r--arm_compute/runtime/NEON/functions/NECopy.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NECropResize.h18
-rw-r--r--arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h26
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h67
-rw-r--r--arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h13
-rw-r--r--arm_compute/runtime/NEON/functions/NEElementwiseOperations.h53
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFT1D.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFT2D.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h21
-rw-r--r--arm_compute/runtime/NEON/functions/NEFill.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFillBorder.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEFloor.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h29
-rw-r--r--arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h27
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMM.h26
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConv2d.h9
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h44
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h13
-rw-r--r--arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NELSTMLayer.h56
-rw-r--r--arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h45
-rw-r--r--arm_compute/runtime/NEON/functions/NEMatMul.h18
-rw-r--r--arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NENormalizationLayer.h16
-rw-r--r--arm_compute/runtime/NEON/functions/NEPadLayer.h21
-rw-r--r--arm_compute/runtime/NEON/functions/NEPermute.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h24
-rw-r--r--arm_compute/runtime/NEON/functions/NEPooling3dLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h7
-rw-r--r--arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEQLSTMLayer.h216
-rw-r--r--arm_compute/runtime/NEON/functions/NERNNLayer.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIAlignLayer.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h9
-rw-r--r--arm_compute/runtime/NEON/functions/NERange.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEReduceMean.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NEReductionOperation.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NEReorderLayer.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NEReverse.h8
-rw-r--r--arm_compute/runtime/NEON/functions/NEScale.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NESlice.h6
-rw-r--r--arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h21
-rw-r--r--arm_compute/runtime/NEON/functions/NESplit.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEStridedSlice.h44
-rw-r--r--arm_compute/runtime/NEON/functions/NETile.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NETranspose.h5
-rw-r--r--arm_compute/runtime/NEON/functions/NEUnstack.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h21
-rw-r--r--arm_compute/runtime/OffsetLifetimeManager.h3
-rw-r--r--arm_compute/runtime/OffsetMemoryPool.h5
-rw-r--r--arm_compute/runtime/OperatorTensor.h2
-rw-r--r--arm_compute/runtime/PoolManager.h12
-rw-r--r--arm_compute/runtime/RuntimeContext.h4
-rw-r--r--arm_compute/runtime/Scheduler.h4
-rw-r--r--arm_compute/runtime/SubTensor.h2
-rw-r--r--arm_compute/runtime/Tensor.h2
-rw-r--r--arm_compute/runtime/TensorAllocator.h1
-rw-r--r--arm_compute/runtime/common/LSTMParams.h18
-rw-r--r--compute_kernel_writer/include/ckw/Error.h24
-rw-r--r--compute_kernel_writer/include/ckw/Kernel.h1
-rw-r--r--compute_kernel_writer/include/ckw/KernelArgument.h3
-rw-r--r--compute_kernel_writer/include/ckw/KernelWriter.h85
-rw-r--r--compute_kernel_writer/include/ckw/TensorInfo.h9
-rw-r--r--compute_kernel_writer/include/ckw/TensorSampler.h23
-rw-r--r--compute_kernel_writer/include/ckw/TileInfo.h2
-rw-r--r--compute_kernel_writer/include/ckw/types/ConstantData.h10
-rw-r--r--compute_kernel_writer/include/ckw/types/MemoryOperation.h10
-rw-r--r--compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h4
-rw-r--r--compute_kernel_writer/prototype/examples/add_exp_store.cpp39
-rw-r--r--compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp8
-rw-r--r--compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h4
-rw-r--r--compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp13
-rw-r--r--compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp1
-rw-r--r--compute_kernel_writer/prototype/examples/writer_helper.cpp31
-rw-r--r--compute_kernel_writer/prototype/include/ckw/Error.h7
-rw-r--r--compute_kernel_writer/prototype/include/ckw/KernelArgument.h3
-rw-r--r--compute_kernel_writer/prototype/include/ckw/KernelWriter.h32
-rw-r--r--compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h122
-rw-r--r--compute_kernel_writer/prototype/include/ckw/OperandBase.h1
-rw-r--r--compute_kernel_writer/prototype/include/ckw/ScalarValue.h8
-rw-r--r--compute_kernel_writer/prototype/include/ckw/TensorInfo.h8
-rw-r--r--compute_kernel_writer/prototype/include/ckw/TensorOperand.h26
-rw-r--r--compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h56
-rw-r--r--compute_kernel_writer/prototype/include/ckw/TileInfo.h2
-rw-r--r--compute_kernel_writer/prototype/include/ckw/types/Functions.h20
-rw-r--r--compute_kernel_writer/prototype/include/ckw/types/Operators.h4
-rw-r--r--compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h40
-rw-r--r--compute_kernel_writer/prototype/src/Kernel.cpp19
-rw-r--r--compute_kernel_writer/prototype/src/KernelArgument.cpp4
-rw-r--r--compute_kernel_writer/prototype/src/KernelWriter.cpp82
-rw-r--r--compute_kernel_writer/prototype/src/OperandBase.cpp3
-rw-r--r--compute_kernel_writer/prototype/src/Prototype.h690
-rw-r--r--compute_kernel_writer/prototype/src/TensorOperand.cpp14
-rw-r--r--compute_kernel_writer/prototype/src/TensorTileSampler.cpp60
-rw-r--r--compute_kernel_writer/prototype/src/TileInfo.cpp9
-rw-r--r--compute_kernel_writer/prototype/src/TileOperand.cpp28
-rw-r--r--compute_kernel_writer/src/Error.cpp6
-rw-r--r--compute_kernel_writer/src/Helpers.cpp6
-rw-r--r--compute_kernel_writer/src/ITensorArgument.h9
-rw-r--r--compute_kernel_writer/src/ITensorComponent.h1
-rw-r--r--compute_kernel_writer/src/ITile.h8
-rw-r--r--compute_kernel_writer/src/Kernel.cpp1
-rw-r--r--compute_kernel_writer/src/KernelArgument.cpp1
-rw-r--r--compute_kernel_writer/src/KernelWriter.cpp8
-rw-r--r--compute_kernel_writer/src/Tensor3dMapper.cpp26
-rw-r--r--compute_kernel_writer/src/Tensor3dMapper.h4
-rw-r--r--compute_kernel_writer/src/TensorOperand.cpp6
-rw-r--r--compute_kernel_writer/src/TensorSampler.cpp6
-rw-r--r--compute_kernel_writer/src/TensorUtils.cpp13
-rw-r--r--compute_kernel_writer/src/TileInfo.cpp9
-rw-r--r--compute_kernel_writer/src/TileOperand.cpp8
-rw-r--r--compute_kernel_writer/src/TileView.h10
-rw-r--r--compute_kernel_writer/src/cl/CLHelpers.cpp77
-rw-r--r--compute_kernel_writer/src/cl/CLKernelWriter.cpp282
-rw-r--r--compute_kernel_writer/src/cl/CLKernelWriter.h104
-rw-r--r--compute_kernel_writer/src/cl/CLTensorArgument.cpp50
-rw-r--r--compute_kernel_writer/src/cl/CLTensorArgument.h6
-rw-r--r--compute_kernel_writer/src/cl/CLTensorComponent.cpp9
-rw-r--r--compute_kernel_writer/src/cl/CLTensorComponent.h5
-rw-r--r--compute_kernel_writer/src/cl/CLTile.cpp41
-rw-r--r--compute_kernel_writer/src/cl/CLTile.h7
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp79
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h23
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp51
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h16
-rw-r--r--compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h21
-rw-r--r--compute_kernel_writer/src/types/ConstantData.cpp49
-rw-r--r--examples/cl_cache.cpp30
-rw-r--r--examples/cl_sgemm.cpp30
-rw-r--r--examples/gemm_tuner/CommonGemmExampleOptions.cpp16
-rw-r--r--examples/gemm_tuner/CommonGemmExampleOptions.h28
-rw-r--r--examples/gemm_tuner/GemmTunerHelpers.h6
-rw-r--r--examples/gemm_tuner/cl_gemm_native.cpp26
-rw-r--r--examples/gemm_tuner/cl_gemm_reshaped.cpp77
-rw-r--r--examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp59
-rw-r--r--examples/gemm_tuner/cl_gemmlowp_reshaped.cpp85
-rw-r--r--examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp87
-rw-r--r--examples/graph_alexnet.cpp152
-rw-r--r--examples/graph_deepspeech_v0_4_1.cpp238
-rw-r--r--examples/graph_edsr.cpp19
-rw-r--r--examples/graph_edsr.h1360
-rw-r--r--examples/graph_googlenet.cpp226
-rw-r--r--examples/graph_inception_resnet_v1.cpp817
-rw-r--r--examples/graph_inception_resnet_v2.cpp1003
-rw-r--r--examples/graph_inception_v3.cpp1237
-rw-r--r--examples/graph_inception_v4.cpp1247
-rw-r--r--examples/graph_lenet.cpp61
-rw-r--r--examples/graph_mobilenet.cpp320
-rw-r--r--examples/graph_mobilenet_v2.cpp338
-rw-r--r--examples/graph_resnet12.cpp136
-rw-r--r--examples/graph_resnet50.cpp190
-rw-r--r--examples/graph_resnet_v2_50.cpp181
-rw-r--r--examples/graph_resnext50.cpp119
-rw-r--r--examples/graph_shufflenet.cpp161
-rw-r--r--examples/graph_squeezenet.cpp205
-rw-r--r--examples/graph_squeezenet_v1_1.cpp205
-rw-r--r--examples/graph_srcnn955.cpp65
-rw-r--r--examples/graph_ssd_mobilenet.cpp744
-rw-r--r--examples/graph_vgg16.cpp272
-rw-r--r--examples/graph_vgg19.cpp295
-rw-r--r--examples/graph_vgg_vdsr.cpp63
-rw-r--r--examples/graph_yolov3.cpp829
-rw-r--r--examples/neon_cnn.cpp34
-rw-r--r--examples/neon_copy_objects.cpp61
-rw-r--r--examples/neon_gemm_qasymm8.cpp23
-rw-r--r--examples/neon_permute.cpp22
-rw-r--r--examples/neon_scale.cpp18
-rw-r--r--examples/neon_sgemm.cpp30
-rw-r--r--src/c/AclContext.cpp21
-rw-r--r--src/c/AclQueue.cpp6
-rw-r--r--src/c/AclTensor.cpp29
-rw-r--r--src/c/AclTensorPack.cpp10
-rw-r--r--src/c/AclVersion.cpp3
-rw-r--r--src/c/cl/AclOpenClExt.cpp45
-rw-r--r--src/common/AllocatorWrapper.cpp3
-rw-r--r--src/common/AllocatorWrapper.h6
-rw-r--r--src/common/IContext.h13
-rw-r--r--src/common/IOperator.cpp4
-rw-r--r--src/common/IOperator.h7
-rw-r--r--src/common/IQueue.h4
-rw-r--r--src/common/ITensorV2.cpp4
-rw-r--r--src/common/ITensorV2.h7
-rw-r--r--src/common/TensorPack.cpp4
-rw-r--r--src/common/TensorPack.h5
-rw-r--r--src/common/cpuinfo/CpuInfo.cpp106
-rw-r--r--src/common/cpuinfo/CpuIsaInfo.cpp49
-rw-r--r--src/common/cpuinfo/CpuIsaInfo.h27
-rw-r--r--src/common/cpuinfo/CpuModel.cpp34
-rw-r--r--src/common/cpuinfo/CpuModel.h4
-rw-r--r--src/common/utils/LegacySupport.cpp25
-rw-r--r--src/common/utils/Log.h57
-rw-r--r--src/common/utils/Macros.h2
-rw-r--r--src/common/utils/Object.h8
-rw-r--r--src/common/utils/Utils.h5
-rw-r--r--src/common/utils/Validate.h2
-rw-r--r--src/core/AccessWindowAutoPadding.cpp16
-rw-r--r--src/core/AccessWindowAutoPadding.h9
-rw-r--r--src/core/AccessWindowStatic.cpp45
-rw-r--r--src/core/AccessWindowStatic.h9
-rw-r--r--src/core/AccessWindowTranspose.cpp54
-rw-r--r--src/core/AccessWindowTranspose.h5
-rw-r--r--src/core/CL/CLCommandBuffer.cpp2
-rw-r--r--src/core/CL/CLCommandBuffer.h5
-rw-r--r--src/core/CL/CLCompatCommandBuffer.cpp32
-rw-r--r--src/core/CL/CLCompatCommandBuffer.h5
-rw-r--r--src/core/CL/CLCompileContext.cpp91
-rw-r--r--src/core/CL/CLHelpers.cpp93
-rw-r--r--src/core/CL/CLKernelLibrary.cpp14
-rw-r--r--src/core/CL/CLMutableCommandBuffer.cpp36
-rw-r--r--src/core/CL/CLMutableCommandBuffer.h5
-rw-r--r--src/core/CL/CLUtils.cpp35
-rw-r--r--src/core/CL/CLUtils.h7
-rw-r--r--src/core/CL/CLValidate.h18
-rw-r--r--src/core/CL/DefaultLWSHeuristics.cpp14
-rw-r--r--src/core/CL/ICLKernel.cpp32
-rw-r--r--src/core/CL/ICLKernel.h60
-rw-r--r--src/core/CL/ICLSimple2DKernel.cpp3
-rw-r--r--src/core/CL/ICLSimple2DKernel.h2
-rw-r--r--src/core/CL/ICLSimple3DKernel.cpp3
-rw-r--r--src/core/CL/ICLSimple3DKernel.h2
-rw-r--r--src/core/CL/ICLSimpleKernel.cpp17
-rw-r--r--src/core/CL/ICLSimpleKernel.h9
-rw-r--r--src/core/CL/ICLTensor.cpp3
-rw-r--r--src/core/CL/OpenCL.cpp549
-rw-r--r--src/core/CL/cl_kernels/activation_float_helpers.h13
-rw-r--r--src/core/CL/cl_kernels/activation_quant_helpers.h15
-rw-r--r--src/core/CL/cl_kernels/gemm_helpers.h252
-rw-r--r--src/core/CL/cl_kernels/helpers.h817
-rw-r--r--src/core/CL/cl_kernels/helpers_asymm.h337
-rw-r--r--src/core/CL/cl_kernels/load_store_utility.h73
-rw-r--r--src/core/CL/cl_kernels/repeat.h42
-rw-r--r--src/core/CL/cl_kernels/warp_helpers.h59
-rw-r--r--src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp80
-rw-r--r--src/core/CL/kernels/CLArgMinMaxLayerKernel.h10
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp125
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.h32
-rw-r--r--src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp76
-rw-r--r--src/core/CL/kernels/CLBatchToSpaceLayerKernel.h25
-rw-r--r--src/core/CL/kernels/CLBitwiseKernel.cpp25
-rw-r--r--src/core/CL/kernels/CLBitwiseKernel.h6
-rw-r--r--src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp42
-rw-r--r--src/core/CL/kernels/CLBoundingBoxTransformKernel.h16
-rw-r--r--src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp64
-rw-r--r--src/core/CL/kernels/CLChannelShuffleLayerKernel.h5
-rw-r--r--src/core/CL/kernels/CLComparisonKernel.cpp75
-rw-r--r--src/core/CL/kernels/CLComparisonKernel.h14
-rw-r--r--src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp25
-rw-r--r--src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h5
-rw-r--r--src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp84
-rw-r--r--src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h23
-rw-r--r--src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp29
-rw-r--r--src/core/CL/kernels/CLDepthToSpaceLayerKernel.h4
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp199
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h45
-rw-r--r--src/core/CL/kernels/CLFFTDigitReverseKernel.cpp42
-rw-r--r--src/core/CL/kernels/CLFFTDigitReverseKernel.h18
-rw-r--r--src/core/CL/kernels/CLFFTRadixStageKernel.cpp46
-rw-r--r--src/core/CL/kernels/CLFFTRadixStageKernel.h9
-rw-r--r--src/core/CL/kernels/CLFFTScaleKernel.cpp26
-rw-r--r--src/core/CL/kernels/CLFFTScaleKernel.h9
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp59
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.h18
-rw-r--r--src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp129
-rw-r--r--src/core/CL/kernels/CLFuseBatchNormalizationKernel.h41
-rw-r--r--src/core/CL/kernels/CLGatherKernel.cpp36
-rw-r--r--src/core/CL/kernels/CLGatherKernel.h10
-rw-r--r--src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp33
-rw-r--r--src/core/CL/kernels/CLGenerateProposalsLayerKernel.h7
-rw-r--r--src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp54
-rw-r--r--src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h16
-rw-r--r--src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp45
-rw-r--r--src/core/CL/kernels/CLL2NormalizeLayerKernel.h11
-rw-r--r--src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp45
-rw-r--r--src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h11
-rw-r--r--src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp19
-rw-r--r--src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h5
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.cpp72
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.h7
-rw-r--r--src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp56
-rw-r--r--src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h9
-rw-r--r--src/core/CL/kernels/CLPadLayerKernel.cpp95
-rw-r--r--src/core/CL/kernels/CLPadLayerKernel.h20
-rw-r--r--src/core/CL/kernels/CLPriorBoxLayerKernel.cpp83
-rw-r--r--src/core/CL/kernels/CLPriorBoxLayerKernel.h27
-rw-r--r--src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp47
-rw-r--r--src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h9
-rw-r--r--src/core/CL/kernels/CLROIAlignLayerKernel.cpp51
-rw-r--r--src/core/CL/kernels/CLROIAlignLayerKernel.h14
-rw-r--r--src/core/CL/kernels/CLROIPoolingLayerKernel.cpp38
-rw-r--r--src/core/CL/kernels/CLROIPoolingLayerKernel.h14
-rw-r--r--src/core/CL/kernels/CLRangeKernel.cpp38
-rw-r--r--src/core/CL/kernels/CLRangeKernel.h1
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.cpp103
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.h10
-rw-r--r--src/core/CL/kernels/CLReorgLayerKernel.cpp41
-rw-r--r--src/core/CL/kernels/CLReorgLayerKernel.h1
-rw-r--r--src/core/CL/kernels/CLReverseKernel.cpp16
-rw-r--r--src/core/CL/kernels/CLReverseKernel.h5
-rw-r--r--src/core/CL/kernels/CLSelectKernel.cpp33
-rw-r--r--src/core/CL/kernels/CLSelectKernel.h7
-rw-r--r--src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp100
-rw-r--r--src/core/CL/kernels/CLSpaceToBatchLayerKernel.h35
-rw-r--r--src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp23
-rw-r--r--src/core/CL/kernels/CLSpaceToDepthLayerKernel.h4
-rw-r--r--src/core/CL/kernels/CLStackLayerKernel.cpp38
-rw-r--r--src/core/CL/kernels/CLStackLayerKernel.h17
-rw-r--r--src/core/CL/kernels/CLStridedSliceKernel.cpp101
-rw-r--r--src/core/CL/kernels/CLStridedSliceKernel.h24
-rw-r--r--src/core/CL/kernels/CLTileKernel.cpp30
-rw-r--r--src/core/CL/kernels/CLTileKernel.h5
-rw-r--r--src/core/CPP/CPPTypes.cpp4
-rw-r--r--src/core/CPP/Validate.h26
-rw-r--r--src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp171
-rw-r--r--src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp110
-rw-r--r--src/core/CPP/kernels/CPPPermuteKernel.cpp45
-rw-r--r--src/core/CPP/kernels/CPPTopKVKernel.cpp43
-rw-r--r--src/core/CPP/kernels/CPPUpsampleKernel.cpp17
-rw-r--r--src/core/Error.cpp5
-rw-r--r--src/core/GPUTarget.cpp97
-rw-r--r--src/core/Helpers.cpp27
-rw-r--r--src/core/IAccessWindow.cpp79
-rw-r--r--src/core/IKernel.cpp3
-rw-r--r--src/core/ITensor.cpp34
-rw-r--r--src/core/ITensorPack.cpp9
-rw-r--r--src/core/NEON/NEAsymm.h308
-rw-r--r--src/core/NEON/NEAsymm.inl10
-rw-r--r--src/core/NEON/NEFixedPoint.inl8
-rw-r--r--src/core/NEON/NEMath.inl105
-rw-r--r--src/core/NEON/NESymm.h95
-rw-r--r--src/core/NEON/SVEAsymm.h47
-rw-r--r--src/core/NEON/SVEMath.h8
-rw-r--r--src/core/NEON/SVEMath.inl70
-rw-r--r--src/core/NEON/SVESymm.h23
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp302
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h21
-rw-r--r--src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp127
-rw-r--r--src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h13
-rw-r--r--src/core/NEON/kernels/NEBitwiseAndKernel.cpp17
-rw-r--r--src/core/NEON/kernels/NEBitwiseNotKernel.cpp14
-rw-r--r--src/core/NEON/kernels/NEBitwiseOrKernel.cpp18
-rw-r--r--src/core/NEON/kernels/NEBitwiseXorKernel.cpp18
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp68
-rw-r--r--src/core/NEON/kernels/NEBoundingBoxTransformKernel.h8
-rw-r--r--src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp97
-rw-r--r--src/core/NEON/kernels/NECol2ImKernel.h4
-rw-r--r--src/core/NEON/kernels/NECropKernel.cpp238
-rw-r--r--src/core/NEON/kernels/NECropKernel.h19
-rw-r--r--src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp76
-rw-r--r--src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp149
-rw-r--r--src/core/NEON/kernels/NEFFTDigitReverseKernel.h6
-rw-r--r--src/core/NEON/kernels/NEFFTRadixStageKernel.cpp594
-rw-r--r--src/core/NEON/kernels/NEFFTRadixStageKernel.h14
-rw-r--r--src/core/NEON/kernels/NEFFTScaleKernel.cpp21
-rw-r--r--src/core/NEON/kernels/NEFFTScaleKernel.h4
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.cpp225
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.h11
-rw-r--r--src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp244
-rw-r--r--src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h39
-rw-r--r--src/core/NEON/kernels/NEGatherKernel.cpp80
-rw-r--r--src/core/NEON/kernels/NEGatherKernel.h5
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp48
-rw-r--r--src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h2
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp57
-rw-r--r--src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h8
-rw-r--r--src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp59
-rw-r--r--src/core/NEON/kernels/NEL2NormalizeLayerKernel.h3
-rw-r--r--src/core/NEON/kernels/NELogicalKernel.cpp91
-rw-r--r--src/core/NEON/kernels/NELogicalKernel.h5
-rw-r--r--src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp54
-rw-r--r--src/core/NEON/kernels/NENormalizationLayerKernel.cpp144
-rw-r--r--src/core/NEON/kernels/NENormalizationLayerKernel.h8
-rw-r--r--src/core/NEON/kernels/NEPadLayerKernel.cpp106
-rw-r--r--src/core/NEON/kernels/NEPadLayerKernel.h13
-rw-r--r--src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp166
-rw-r--r--src/core/NEON/kernels/NEPriorBoxLayerKernel.h14
-rw-r--r--src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp118
-rw-r--r--src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h33
-rw-r--r--src/core/NEON/kernels/NEROIAlignLayerKernel.cpp79
-rw-r--r--src/core/NEON/kernels/NEROIAlignLayerKernel.h5
-rw-r--r--src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp85
-rw-r--r--src/core/NEON/kernels/NEROIPoolingLayerKernel.h8
-rw-r--r--src/core/NEON/kernels/NERangeKernel.cpp90
-rw-r--r--src/core/NEON/kernels/NERangeKernel.h1
-rw-r--r--src/core/NEON/kernels/NEReductionOperationKernel.cpp1955
-rw-r--r--src/core/NEON/kernels/NEReductionOperationKernel.h3
-rw-r--r--src/core/NEON/kernels/NEReorderKernel.cpp70
-rw-r--r--src/core/NEON/kernels/NEReorderKernel.h33
-rw-r--r--src/core/NEON/kernels/NEReorgLayerKernel.cpp56
-rw-r--r--src/core/NEON/kernels/NEReverseKernel.cpp98
-rw-r--r--src/core/NEON/kernels/NEReverseKernel.h3
-rw-r--r--src/core/NEON/kernels/NESelectKernel.cpp156
-rw-r--r--src/core/NEON/kernels/NESelectKernel.h2
-rw-r--r--src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp161
-rw-r--r--src/core/NEON/kernels/NESpaceToBatchLayerKernel.h20
-rw-r--r--src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp59
-rw-r--r--src/core/NEON/kernels/NESpaceToDepthLayerKernel.h1
-rw-r--r--src/core/NEON/kernels/NEStackLayerKernel.cpp55
-rw-r--r--src/core/NEON/kernels/NEStackLayerKernel.h10
-rw-r--r--src/core/NEON/kernels/NEStridedSliceKernel.cpp115
-rw-r--r--src/core/NEON/kernels/NEStridedSliceKernel.h23
-rw-r--r--src/core/NEON/kernels/NETileKernel.cpp47
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp270
-rw-r--r--src/core/NEON/kernels/assembly/depthwise_common.hpp106
-rw-r--r--src/core/NEON/kernels/assembly/pool_common.hpp71
-rw-r--r--src/core/NEON/kernels/assembly/pooling.hpp210
-rw-r--r--src/core/NEON/kernels/assembly/premultiply.hpp17
-rw-r--r--src/core/NEON/kernels/assembly/winograd.hpp181
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp166
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp166
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp115
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp115
-rw-r--r--src/core/NEON/kernels/batchnormalization/impl/list.h6
-rw-r--r--src/core/NEON/kernels/detail/NEActivationFunctionDetail.h7
-rw-r--r--src/core/NEON/kernels/detail/NEColorConvertHelper.inl735
-rw-r--r--src/core/NEON/kernels/detail/NEDirectConvolution3x3.h80
-rw-r--r--src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h507
-rw-r--r--src/core/NEON/wrapper/intrinsics/cvt.h47
-rw-r--r--src/core/NEON/wrapper/intrinsics/div.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/erf.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/exp.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/getlane.h14
-rw-r--r--src/core/NEON/wrapper/intrinsics/inv.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/invsqrt.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/log.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/pow.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/qmov.h6
-rw-r--r--src/core/NEON/wrapper/intrinsics/reinterpret.h2
-rw-r--r--src/core/NEON/wrapper/intrinsics/round.h1
-rw-r--r--src/core/NEON/wrapper/intrinsics/setlane.h12
-rw-r--r--src/core/NEON/wrapper/intrinsics/shr.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/sin.h3
-rw-r--r--src/core/NEON/wrapper/intrinsics/svcnt.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/svcvt.h35
-rw-r--r--src/core/NEON/wrapper/intrinsics/svexp.h3
-rw-r--r--src/core/NEON/wrapper/intrinsics/svlog.h3
-rw-r--r--src/core/NEON/wrapper/intrinsics/svptrue.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/svwhilelt.h4
-rw-r--r--src/core/NEON/wrapper/intrinsics/tanh.h1
-rw-r--r--src/core/NEON/wrapper/scalar/add.h12
-rw-r--r--src/core/NEON/wrapper/scalar/sub.h12
-rw-r--r--src/core/NEON/wrapper/svtraits.h1
-rw-r--r--src/core/Rounding.cpp7
-rw-r--r--src/core/Size2D.cpp3
-rw-r--r--src/core/Size3D.cpp6
-rw-r--r--src/core/SubTensorInfo.cpp38
-rw-r--r--src/core/TensorInfo.cpp99
-rw-r--r--src/core/Utils.cpp260
-rw-r--r--src/core/Validate.cpp115
-rw-r--r--src/core/common/Macros.h4
-rw-r--r--src/core/common/Registrars.h12
-rw-r--r--src/core/helpers/AutoConfiguration.h21
-rw-r--r--src/core/helpers/MemoryHelpers.h61
-rw-r--r--src/core/helpers/PoolingHelpers.h101
-rw-r--r--src/core/helpers/ScaleHelpers.h23
-rw-r--r--src/core/helpers/SoftmaxHelpers.cpp2
-rw-r--r--src/core/helpers/Utils.cpp4
-rw-r--r--src/core/helpers/Utils.h2
-rw-r--r--src/core/helpers/WindowHelpers.cpp163
-rw-r--r--src/core/helpers/WindowHelpers.h57
-rw-r--r--src/core/utils/ActivationFunctionUtils.cpp36
-rw-r--r--src/core/utils/AssemblyUtils.cpp14
-rw-r--r--src/core/utils/AssemblyUtils.h3
-rw-r--r--src/core/utils/DataLayoutUtils.cpp9
-rw-r--r--src/core/utils/DataTypeUtils.cpp54
-rw-r--r--src/core/utils/FormatUtils.cpp30
-rw-r--r--src/core/utils/InterpolationPolicyUtils.cpp9
-rw-r--r--src/core/utils/ScaleUtils.cpp15
-rw-r--r--src/core/utils/ScaleUtils.h7
-rw-r--r--src/core/utils/StringUtils.cpp16
-rw-r--r--src/core/utils/helpers/fft.cpp19
-rw-r--r--src/core/utils/helpers/float_ops.h3
-rw-r--r--src/core/utils/helpers/tensor_info.h14
-rw-r--r--src/core/utils/helpers/tensor_transform.cpp63
-rw-r--r--src/core/utils/io/FileHandler.cpp7
-rw-r--r--src/core/utils/logging/FilePrinter.cpp5
-rw-r--r--src/core/utils/logging/Helpers.cpp13
-rw-r--r--src/core/utils/logging/Logger.cpp17
-rw-r--r--src/core/utils/logging/LoggerRegistry.cpp18
-rw-r--r--src/core/utils/misc/MMappedFile.cpp26
-rw-r--r--src/core/utils/quantization/AsymmHelpers.cpp72
-rw-r--r--src/core/utils/quantization/AsymmHelpers.h7
-rw-r--r--src/cpu/CpuContext.cpp28
-rw-r--r--src/cpu/CpuContext.h12
-rw-r--r--src/cpu/CpuQueue.cpp3
-rw-r--r--src/cpu/CpuQueue.h4
-rw-r--r--src/cpu/CpuTensor.cpp5
-rw-r--r--src/cpu/CpuTensor.h8
-rw-r--r--src/cpu/CpuTypes.h2
-rw-r--r--src/cpu/ICpuKernel.h13
-rw-r--r--src/cpu/kernels/CpuActivationKernel.cpp246
-rw-r--r--src/cpu/kernels/CpuActivationKernel.h10
-rw-r--r--src/cpu/kernels/CpuAddKernel.cpp233
-rw-r--r--src/cpu/kernels/CpuAddKernel.h12
-rw-r--r--src/cpu/kernels/CpuAddMulAddKernel.cpp99
-rw-r--r--src/cpu/kernels/CpuAddMulAddKernel.h40
-rw-r--r--src/cpu/kernels/CpuCastKernel.cpp1346
-rw-r--r--src/cpu/kernels/CpuCastKernel.h7
-rw-r--r--src/cpu/kernels/CpuCol2ImKernel.cpp27
-rw-r--r--src/cpu/kernels/CpuCol2ImKernel.h3
-rw-r--r--src/cpu/kernels/CpuConcatenateBatchKernel.cpp126
-rw-r--r--src/cpu/kernels/CpuConcatenateBatchKernel.h6
-rw-r--r--src/cpu/kernels/CpuConcatenateDepthKernel.cpp126
-rw-r--r--src/cpu/kernels/CpuConcatenateDepthKernel.h6
-rw-r--r--src/cpu/kernels/CpuConcatenateHeightKernel.cpp120
-rw-r--r--src/cpu/kernels/CpuConcatenateHeightKernel.h4
-rw-r--r--src/cpu/kernels/CpuConcatenateWidthKernel.cpp118
-rw-r--r--src/cpu/kernels/CpuConcatenateWidthKernel.h4
-rw-r--r--src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp32
-rw-r--r--src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h20
-rw-r--r--src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp55
-rw-r--r--src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h2
-rw-r--r--src/cpu/kernels/CpuCopyKernel.cpp61
-rw-r--r--src/cpu/kernels/CpuCopyKernel.h2
-rw-r--r--src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp130
-rw-r--r--src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h24
-rw-r--r--src/cpu/kernels/CpuDequantizeKernel.cpp234
-rw-r--r--src/cpu/kernels/CpuDequantizeKernel.h2
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.cpp62
-rw-r--r--src/cpu/kernels/CpuDirectConv2dKernel.h14
-rw-r--r--src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp388
-rw-r--r--src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h34
-rw-r--r--src/cpu/kernels/CpuDirectConv3dKernel.cpp87
-rw-r--r--src/cpu/kernels/CpuDirectConv3dKernel.h20
-rw-r--r--src/cpu/kernels/CpuElementwiseKernel.cpp471
-rw-r--r--src/cpu/kernels/CpuElementwiseKernel.h13
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.cpp90
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.h11
-rw-r--r--src/cpu/kernels/CpuFillKernel.cpp22
-rw-r--r--src/cpu/kernels/CpuFillKernel.h3
-rw-r--r--src/cpu/kernels/CpuFloorKernel.cpp39
-rw-r--r--src/cpu/kernels/CpuFloorKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp56
-rw-r--r--src/cpu/kernels/CpuGemmInterleave4x4Kernel.h2
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp1280
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h4
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp316
-rw-r--r--src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h29
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp416
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h23
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp748
-rw-r--r--src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h28
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp247
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h19
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp153
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h29
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp183
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h32
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp181
-rw-r--r--src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h32
-rw-r--r--src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp39
-rw-r--r--src/cpu/kernels/CpuGemmMatrixAdditionKernel.h6
-rw-r--r--src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp85
-rw-r--r--src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h23
-rw-r--r--src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp43
-rw-r--r--src/cpu/kernels/CpuGemmTranspose1xWKernel.h2
-rw-r--r--src/cpu/kernels/CpuIm2ColKernel.cpp288
-rw-r--r--src/cpu/kernels/CpuIm2ColKernel.h35
-rw-r--r--src/cpu/kernels/CpuKernelSelectionTypes.h31
-rw-r--r--src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp75
-rw-r--r--src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h13
-rw-r--r--src/cpu/kernels/CpuMulKernel.cpp1772
-rw-r--r--src/cpu/kernels/CpuMulKernel.h40
-rw-r--r--src/cpu/kernels/CpuPermuteKernel.cpp155
-rw-r--r--src/cpu/kernels/CpuPermuteKernel.h2
-rw-r--r--src/cpu/kernels/CpuPool2dKernel.cpp329
-rw-r--r--src/cpu/kernels/CpuPool2dKernel.h20
-rw-r--r--src/cpu/kernels/CpuPool3dKernel.cpp75
-rw-r--r--src/cpu/kernels/CpuPool3dKernel.h10
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.cpp167
-rw-r--r--src/cpu/kernels/CpuQuantizeKernel.h8
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.cpp65
-rw-r--r--src/cpu/kernels/CpuReshapeKernel.h7
-rw-r--r--src/cpu/kernels/CpuScaleKernel.cpp607
-rw-r--r--src/cpu/kernels/CpuScaleKernel.h69
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.cpp193
-rw-r--r--src/cpu/kernels/CpuSoftmaxKernel.h23
-rw-r--r--src/cpu/kernels/CpuSubKernel.cpp109
-rw-r--r--src/cpu/kernels/CpuSubKernel.h10
-rw-r--r--src/cpu/kernels/CpuTransposeKernel.cpp772
-rw-r--r--src/cpu/kernels/CpuTransposeKernel.h2
-rw-r--r--src/cpu/kernels/CpuWeightsReshapeKernel.cpp86
-rw-r--r--src/cpu/kernels/CpuWeightsReshapeKernel.h2
-rw-r--r--src/cpu/kernels/CpuWinogradConv2dKernel.cpp59
-rw-r--r--src/cpu/kernels/CpuWinogradConv2dKernel.h11
-rw-r--r--src/cpu/kernels/activation/generic/neon/fp16.cpp4
-rw-r--r--src/cpu/kernels/activation/generic/neon/fp32.cpp2
-rw-r--r--src/cpu/kernels/activation/generic/neon/impl.h274
-rw-r--r--src/cpu/kernels/activation/generic/neon/lut.cpp20
-rw-r--r--src/cpu/kernels/activation/generic/neon/qasymm8.cpp344
-rw-r--r--src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp300
-rw-r--r--src/cpu/kernels/activation/generic/neon/qsymm16.cpp163
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp16.cpp148
-rw-r--r--src/cpu/kernels/activation/generic/sve/fp32.cpp149
-rw-r--r--src/cpu/kernels/activation/generic/sve2/lut.cpp20
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8.cpp264
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp306
-rw-r--r--src/cpu/kernels/activation/generic/sve2/qsymm16.cpp121
-rw-r--r--src/cpu/kernels/add/generic/neon/fp16.cpp5
-rw-r--r--src/cpu/kernels/add/generic/neon/fp32.cpp5
-rw-r--r--src/cpu/kernels/add/generic/neon/impl.cpp711
-rw-r--r--src/cpu/kernels/add/generic/neon/impl.h145
-rw-r--r--src/cpu/kernels/add/generic/neon/integer.cpp11
-rw-r--r--src/cpu/kernels/add/generic/neon/qasymm8.cpp6
-rw-r--r--src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp6
-rw-r--r--src/cpu/kernels/add/generic/neon/qsymm16.cpp162
-rw-r--r--src/cpu/kernels/add/generic/sve/fp16.cpp5
-rw-r--r--src/cpu/kernels/add/generic/sve/fp32.cpp6
-rw-r--r--src/cpu/kernels/add/generic/sve/impl.cpp106
-rw-r--r--src/cpu/kernels/add/generic/sve/impl.h3
-rw-r--r--src/cpu/kernels/add/generic/sve/integer.cpp12
-rw-r--r--src/cpu/kernels/add/generic/sve2/qasymm8.cpp237
-rw-r--r--src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp193
-rw-r--r--src/cpu/kernels/add/generic/sve2/qsymm16.cpp119
-rw-r--r--src/cpu/kernels/add/list.h7
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/fp16.cpp106
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/fp32.cpp104
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp137
-rw-r--r--src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp137
-rw-r--r--src/cpu/kernels/addmuladd/list.h5
-rw-r--r--src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h12
-rw-r--r--src/cpu/kernels/assembly/arm_gemm.hpp91
-rw-r--r--src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp38
-rw-r--r--src/cpu/kernels/assembly/gemm_common.hpp74
-rw-r--r--src/cpu/kernels/assembly/ndrange.hpp19
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp6
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp6
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp85
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/impl.h85
-rw-r--r--src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp6
-rw-r--r--src/cpu/kernels/boundingboxtransform/list.h5
-rw-r--r--src/cpu/kernels/cast/generic/neon/fp16.cpp393
-rw-r--r--src/cpu/kernels/cast/list.h7
-rw-r--r--src/cpu/kernels/conv3d/neon/list.h165
-rw-r--r--src/cpu/kernels/conv3d/neon/quantized.h257
-rw-r--r--src/cpu/kernels/crop/generic/neon/crop_helper.h4
-rw-r--r--src/cpu/kernels/crop/generic/neon/fp16.cpp17
-rw-r--r--src/cpu/kernels/crop/generic/neon/fp32.cpp17
-rw-r--r--src/cpu/kernels/crop/generic/neon/impl.h49
-rw-r--r--src/cpu/kernels/crop/generic/neon/integer.cpp92
-rw-r--r--src/cpu/kernels/crop/list.h6
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp11
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp11
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp592
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h279
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp20
-rw-r--r--src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp20
-rw-r--r--src/cpu/kernels/depthwiseconv2d/list.h6
-rw-r--r--src/cpu/kernels/directconv2d/list.h5
-rw-r--r--src/cpu/kernels/directconv2d/nchw/all.cpp142
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp5
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp252
-rw-r--r--src/cpu/kernels/directconv2d/nhwc/neon/impl.h4
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp73
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp73
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/impl.h852
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp173
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp76
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp74
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp73
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp71
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp250
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/impl.h16
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp171
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/impl.h379
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp76
-rw-r--r--src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp74
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp6
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp6
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/impl.h244
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp6
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp21
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp4
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp4
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp6
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp6
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp44
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp6
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp20
-rw-r--r--src/cpu/kernels/floor/list.h3
-rw-r--r--src/cpu/kernels/floor/neon/fp16.cpp4
-rw-r--r--src/cpu/kernels/floor/neon/fp32.cpp4
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp16
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp16
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/generic/impl.h120
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/list.h15
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp147
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp16
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp16
-rw-r--r--src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h143
-rw-r--r--src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp44
-rw-r--r--src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp49
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp472
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp15
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp898
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h7
-rw-r--r--src/cpu/kernels/gemm_matrix_mul/list.h5
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/fp16.cpp7
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/fp32.cpp7
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/impl.cpp43
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/impl.h41
-rw-r--r--src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp7
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/fp16.cpp201
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/fp32.cpp8
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/impl.cpp173
-rw-r--r--src/cpu/kernels/instancenorm/generic/neon/impl.h6
-rw-r--r--src/cpu/kernels/instancenorm/list.h5
-rw-r--r--src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp165
-rw-r--r--src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h23
-rw-r--r--src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp99
-rw-r--r--src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h13
-rw-r--r--src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp6
-rw-r--r--src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp10
-rw-r--r--src/cpu/kernels/l2normlayer/generic/neon/impl.h96
-rw-r--r--src/cpu/kernels/l2normlayer/list.h5
-rw-r--r--src/cpu/kernels/lut/generic/neon/u8.cpp736
-rw-r--r--src/cpu/kernels/lut/generic/sve2/u8.cpp10
-rw-r--r--src/cpu/kernels/lut/list.h10
-rw-r--r--src/cpu/kernels/maxunpool/generic/neon/impl.h17
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp102
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp97
-rw-r--r--src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp124
-rw-r--r--src/cpu/kernels/pool2d/neon/fp16.cpp442
-rw-r--r--src/cpu/kernels/pool2d/neon/fp32.cpp562
-rw-r--r--src/cpu/kernels/pool2d/neon/list.h38
-rw-r--r--src/cpu/kernels/pool2d/neon/nchw/all.cpp1097
-rw-r--r--src/cpu/kernels/pool2d/neon/qasymm8.cpp12
-rw-r--r--src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp12
-rw-r--r--src/cpu/kernels/pool2d/neon/quantized.h961
-rw-r--r--src/cpu/kernels/pool3d/neon/impl.h417
-rw-r--r--src/cpu/kernels/pool3d/neon/quantized.h403
-rw-r--r--src/cpu/kernels/range/generic/neon/fp16.cpp4
-rw-r--r--src/cpu/kernels/range/generic/neon/fp32.cpp4
-rw-r--r--src/cpu/kernels/range/generic/neon/impl.h48
-rw-r--r--src/cpu/kernels/range/list.h3
-rw-r--r--src/cpu/kernels/roialign/generic/neon/fp16.cpp7
-rw-r--r--src/cpu/kernels/roialign/generic/neon/fp32.cpp7
-rw-r--r--src/cpu/kernels/roialign/generic/neon/impl.h192
-rw-r--r--src/cpu/kernels/roialign/generic/neon/qasymm8.cpp7
-rw-r--r--src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp7
-rw-r--r--src/cpu/kernels/roialign/list.h6
-rw-r--r--src/cpu/kernels/scale/neon/fp16.cpp189
-rw-r--r--src/cpu/kernels/scale/neon/integer.cpp416
-rw-r--r--src/cpu/kernels/scale/neon/list.h163
-rw-r--r--src/cpu/kernels/scale/neon/qasymm8.cpp217
-rw-r--r--src/cpu/kernels/scale/neon/qasymm8_signed.cpp206
-rw-r--r--src/cpu/kernels/scale/sve/fp16.cpp75
-rw-r--r--src/cpu/kernels/scale/sve/fp32.cpp76
-rw-r--r--src/cpu/kernels/scale/sve/integer.cpp145
-rw-r--r--src/cpu/kernels/scale/sve/list.h8
-rw-r--r--src/cpu/kernels/scale/sve/qasymm8.cpp74
-rw-r--r--src/cpu/kernels/scale/sve/qasymm8_signed.cpp74
-rw-r--r--src/cpu/kernels/select/generic/neon/fp16.cpp12
-rw-r--r--src/cpu/kernels/select/generic/neon/fp32.cpp10
-rw-r--r--src/cpu/kernels/select/generic/neon/impl.h111
-rw-r--r--src/cpu/kernels/select/generic/neon/integer.cpp40
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp16.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/neon/fp32.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.cpp281
-rw-r--r--src/cpu/kernels/softmax/generic/neon/impl.h248
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/sve/fp16.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/sve/fp32.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.cpp211
-rw-r--r--src/cpu/kernels/softmax/generic/sve/impl.h9
-rw-r--r--src/cpu/kernels/softmax/generic/sve/qasymm8.cpp3
-rw-r--r--src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp3
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/impl.cpp289
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/impl.h9
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp12
-rw-r--r--src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp12
-rw-r--r--src/cpu/kernels/softmax/list.h9
-rw-r--r--src/cpu/kernels/sub/neon/list.h119
-rw-r--r--src/cpu/kernels/sub/neon/qasymm8.cpp9
-rw-r--r--src/cpu/kernels/sub/neon/qasymm8_signed.cpp9
-rw-r--r--src/cpu/kernels/sub/neon/qsymm16.cpp166
-rw-r--r--src/cpu/operators/CpuActivation.cpp14
-rw-r--r--src/cpu/operators/CpuActivation.h1
-rw-r--r--src/cpu/operators/CpuAdd.cpp17
-rw-r--r--src/cpu/operators/CpuAdd.h13
-rw-r--r--src/cpu/operators/CpuAddMulAdd.cpp85
-rw-r--r--src/cpu/operators/CpuAddMulAdd.h26
-rw-r--r--src/cpu/operators/CpuCast.cpp3
-rw-r--r--src/cpu/operators/CpuConcatenate.cpp34
-rw-r--r--src/cpu/operators/CpuConcatenate.h4
-rw-r--r--src/cpu/operators/CpuConv2d.cpp140
-rw-r--r--src/cpu/operators/CpuConv2d.h40
-rw-r--r--src/cpu/operators/CpuConvertFullyConnectedWeights.cpp13
-rw-r--r--src/cpu/operators/CpuConvertFullyConnectedWeights.h8
-rw-r--r--src/cpu/operators/CpuCopy.cpp3
-rw-r--r--src/cpu/operators/CpuDepthwiseConv2d.cpp157
-rw-r--r--src/cpu/operators/CpuDepthwiseConv2d.h86
-rw-r--r--src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp29
-rw-r--r--src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h17
-rw-r--r--src/cpu/operators/CpuDequantize.cpp1
-rw-r--r--src/cpu/operators/CpuDirectConv2d.cpp50
-rw-r--r--src/cpu/operators/CpuDirectConv2d.h24
-rw-r--r--src/cpu/operators/CpuDirectConv3d.cpp27
-rw-r--r--src/cpu/operators/CpuDirectConv3d.h16
-rw-r--r--src/cpu/operators/CpuElementwise.cpp18
-rw-r--r--src/cpu/operators/CpuElementwise.h5
-rw-r--r--src/cpu/operators/CpuElementwiseUnary.cpp5
-rw-r--r--src/cpu/operators/CpuElementwiseUnary.h3
-rw-r--r--src/cpu/operators/CpuFill.cpp3
-rw-r--r--src/cpu/operators/CpuFill.h1
-rw-r--r--src/cpu/operators/CpuFlatten.cpp6
-rw-r--r--src/cpu/operators/CpuFloor.cpp3
-rw-r--r--src/cpu/operators/CpuFullyConnected.cpp225
-rw-r--r--src/cpu/operators/CpuFullyConnected.h52
-rw-r--r--src/cpu/operators/CpuGemm.cpp198
-rw-r--r--src/cpu/operators/CpuGemm.h66
-rw-r--r--src/cpu/operators/CpuGemmConv2d.cpp378
-rw-r--r--src/cpu/operators/CpuGemmConv2d.h78
-rw-r--r--src/cpu/operators/CpuGemmDirectConv2d.cpp85
-rw-r--r--src/cpu/operators/CpuGemmDirectConv2d.h17
-rw-r--r--src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp365
-rw-r--r--src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h17
-rw-r--r--src/cpu/operators/CpuGemmLowpOutputStage.cpp52
-rw-r--r--src/cpu/operators/CpuGemmLowpOutputStage.h6
-rw-r--r--src/cpu/operators/CpuMatMul.cpp113
-rw-r--r--src/cpu/operators/CpuMatMul.h34
-rw-r--r--src/cpu/operators/CpuMaxUnpooling.cpp13
-rw-r--r--src/cpu/operators/CpuMaxUnpooling.h8
-rw-r--r--src/cpu/operators/CpuMul.cpp27
-rw-r--r--src/cpu/operators/CpuMul.h25
-rw-r--r--src/cpu/operators/CpuPermute.cpp5
-rw-r--r--src/cpu/operators/CpuPool2d.cpp35
-rw-r--r--src/cpu/operators/CpuPool2d.h11
-rw-r--r--src/cpu/operators/CpuPool3d.cpp6
-rw-r--r--src/cpu/operators/CpuPool3d.h3
-rw-r--r--src/cpu/operators/CpuQuantize.cpp1
-rw-r--r--src/cpu/operators/CpuReshape.cpp7
-rw-r--r--src/cpu/operators/CpuReshape.h5
-rw-r--r--src/cpu/operators/CpuScale.cpp128
-rw-r--r--src/cpu/operators/CpuScale.h9
-rw-r--r--src/cpu/operators/CpuSoftmax.cpp101
-rw-r--r--src/cpu/operators/CpuSoftmax.h6
-rw-r--r--src/cpu/operators/CpuSub.cpp17
-rw-r--r--src/cpu/operators/CpuSub.h13
-rw-r--r--src/cpu/operators/CpuTranspose.cpp5
-rw-r--r--src/cpu/operators/CpuWinogradConv2d.cpp263
-rw-r--r--src/cpu/operators/CpuWinogradConv2d.h62
-rw-r--r--src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp393
-rw-r--r--src/cpu/operators/internal/CpuGemmAssemblyDispatch.h55
-rw-r--r--src/cpu/utils/CpuAuxTensorHandler.h26
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp61
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h11
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp81
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp7
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h5
-rw-r--r--src/dynamic_fusion/sketch/ArgumentPack.h45
-rw-r--r--src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp3
-rw-r--r--src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp1
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h15
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp20
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h18
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp105
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h23
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp10
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h5
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp10
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp48
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h4
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp16
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h6
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp4
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h11
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h56
-rw-r--r--src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp7
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h6
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp21
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h4
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp12
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp1
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h2
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp20
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h8
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h10
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp34
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h10
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp44
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h10
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp49
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp84
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h14
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp171
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h8
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp76
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp10
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h6
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h31
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h35
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h3
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h7
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h15
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp12
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h18
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp30
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h10
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp57
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h34
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp64
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h26
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp65
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h12
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp14
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h7
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp14
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h7
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp49
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h21
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp12
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h5
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp14
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h13
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp20
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h5
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h22
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp19
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp52
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp44
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp65
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp90
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp13
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp19
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp43
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp27
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp40
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp31
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp38
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp19
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp35
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp19
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp36
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h17
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h8
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp26
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp30
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp81
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp112
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp94
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h5
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp57
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h4
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp35
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp92
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp28
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h4
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp56
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp16
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h1
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp59
-rw-r--r--src/dynamic_fusion/sketch/utils/DependencyGraph.h182
-rw-r--r--src/dynamic_fusion/utils/Utils.h16
-rw-r--r--src/gpu/cl/ClContext.cpp17
-rw-r--r--src/gpu/cl/ClContext.h12
-rw-r--r--src/gpu/cl/ClKernelLibrary.cpp679
-rw-r--r--src/gpu/cl/ClKernelLibrary.h14
-rw-r--r--src/gpu/cl/ClQueue.cpp7
-rw-r--r--src/gpu/cl/ClQueue.h4
-rw-r--r--src/gpu/cl/ClTensor.cpp7
-rw-r--r--src/gpu/cl/ClTensor.h8
-rw-r--r--src/gpu/cl/IClKernel.h1
-rw-r--r--src/gpu/cl/kernels/ClActivationKernel.cpp115
-rw-r--r--src/gpu/cl/kernels/ClActivationKernel.h8
-rw-r--r--src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp24
-rw-r--r--src/gpu/cl/kernels/ClBatchConcatenateKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClCastKernel.cpp42
-rw-r--r--src/gpu/cl/kernels/ClCastKernel.h3
-rw-r--r--src/gpu/cl/kernels/ClCol2ImKernel.cpp47
-rw-r--r--src/gpu/cl/kernels/ClCol2ImKernel.h10
-rw-r--r--src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp26
-rw-r--r--src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h11
-rw-r--r--src/gpu/cl/kernels/ClCopyKernel.cpp36
-rw-r--r--src/gpu/cl/kernels/ClCopyKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClCropKernel.cpp45
-rw-r--r--src/gpu/cl/kernels/ClCropKernel.h19
-rw-r--r--src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp28
-rw-r--r--src/gpu/cl/kernels/ClDepthConcatenateKernel.h3
-rw-r--r--src/gpu/cl/kernels/ClDequantizeKernel.cpp27
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.cpp192
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.h26
-rw-r--r--src/gpu/cl/kernels/ClDirectConv3dKernel.cpp72
-rw-r--r--src/gpu/cl/kernels/ClDirectConv3dKernel.h13
-rw-r--r--src/gpu/cl/kernels/ClElementwiseKernel.cpp246
-rw-r--r--src/gpu/cl/kernels/ClElementwiseKernel.h81
-rw-r--r--src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp44
-rw-r--r--src/gpu/cl/kernels/ClElementwiseUnaryKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClFillKernel.cpp27
-rw-r--r--src/gpu/cl/kernels/ClFillKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClFloorKernel.cpp15
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp124
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h26
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp108
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h26
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp245
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h40
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp214
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h36
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp85
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h16
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp123
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h29
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp61
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h11
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp56
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h11
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp64
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h13
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp60
-rw-r--r--src/gpu/cl/kernels/ClGemmLowpReductionKernel.h22
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp190
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h33
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp168
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h41
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp166
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h48
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp141
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h27
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp48
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h13
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp53
-rw-r--r--src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h7
-rw-r--r--src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp24
-rw-r--r--src/gpu/cl/kernels/ClHeightConcatenateKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClIm2ColKernel.cpp154
-rw-r--r--src/gpu/cl/kernels/ClIm2ColKernel.h31
-rw-r--r--src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp52
-rw-r--r--src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h15
-rw-r--r--src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp132
-rw-r--r--src/gpu/cl/kernels/ClIndirectConv2dKernel.h24
-rw-r--r--src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp63
-rw-r--r--src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h14
-rw-r--r--src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp51
-rw-r--r--src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h14
-rw-r--r--src/gpu/cl/kernels/ClMatMulNativeKernel.cpp80
-rw-r--r--src/gpu/cl/kernels/ClMatMulNativeKernel.h16
-rw-r--r--src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp58
-rw-r--r--src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h19
-rw-r--r--src/gpu/cl/kernels/ClMulKernel.cpp185
-rw-r--r--src/gpu/cl/kernels/ClMulKernel.h31
-rw-r--r--src/gpu/cl/kernels/ClPermuteKernel.cpp27
-rw-r--r--src/gpu/cl/kernels/ClPermuteKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClPool2dKernel.cpp184
-rw-r--r--src/gpu/cl/kernels/ClPool2dKernel.h15
-rw-r--r--src/gpu/cl/kernels/ClPool3dKernel.cpp103
-rw-r--r--src/gpu/cl/kernels/ClPool3dKernel.h9
-rw-r--r--src/gpu/cl/kernels/ClQuantizeKernel.cpp27
-rw-r--r--src/gpu/cl/kernels/ClReshapeKernel.cpp30
-rw-r--r--src/gpu/cl/kernels/ClReshapeKernel.h2
-rw-r--r--src/gpu/cl/kernels/ClScaleKernel.cpp68
-rw-r--r--src/gpu/cl/kernels/ClScaleKernel.h6
-rw-r--r--src/gpu/cl/kernels/ClSoftmaxKernel.cpp114
-rw-r--r--src/gpu/cl/kernels/ClSoftmaxKernel.h20
-rw-r--r--src/gpu/cl/kernels/ClTransposeKernel.cpp24
-rw-r--r--src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp64
-rw-r--r--src/gpu/cl/kernels/ClTransposedConvolutionKernel.h17
-rw-r--r--src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp44
-rw-r--r--src/gpu/cl/kernels/ClWeightsReshapeKernel.h11
-rw-r--r--src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp30
-rw-r--r--src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h4
-rw-r--r--src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp65
-rw-r--r--src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h21
-rw-r--r--src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp21
-rw-r--r--src/gpu/cl/kernels/ClWidthConcatenateKernel.h5
-rw-r--r--src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp46
-rw-r--r--src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h8
-rw-r--r--src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp63
-rw-r--r--src/gpu/cl/kernels/ClWinogradInputTransformKernel.h22
-rw-r--r--src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp110
-rw-r--r--src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h23
-rw-r--r--src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp79
-rw-r--r--src/gpu/cl/kernels/gemm/ClGemmHelpers.h36
-rw-r--r--src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h12
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp90
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h21
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp19
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h6
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp54
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h12
-rw-r--r--src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h2
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp163
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h27
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp168
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h18
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h2
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp242
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h39
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp550
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h27
-rw-r--r--src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h2
-rw-r--r--src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp15
-rw-r--r--src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h12
-rw-r--r--src/gpu/cl/operators/ClActivation.cpp20
-rw-r--r--src/gpu/cl/operators/ClActivation.h6
-rw-r--r--src/gpu/cl/operators/ClAdd.cpp18
-rw-r--r--src/gpu/cl/operators/ClAdd.h12
-rw-r--r--src/gpu/cl/operators/ClCast.cpp8
-rw-r--r--src/gpu/cl/operators/ClCast.h3
-rw-r--r--src/gpu/cl/operators/ClConcatenate.cpp73
-rw-r--r--src/gpu/cl/operators/ClConcatenate.h9
-rw-r--r--src/gpu/cl/operators/ClConv2d.cpp195
-rw-r--r--src/gpu/cl/operators/ClConv2d.h28
-rw-r--r--src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp16
-rw-r--r--src/gpu/cl/operators/ClConvertFullyConnectedWeights.h11
-rw-r--r--src/gpu/cl/operators/ClCopy.cpp5
-rw-r--r--src/gpu/cl/operators/ClCopy.h6
-rw-r--r--src/gpu/cl/operators/ClCrop.cpp23
-rw-r--r--src/gpu/cl/operators/ClCrop.h20
-rw-r--r--src/gpu/cl/operators/ClDequantize.cpp4
-rw-r--r--src/gpu/cl/operators/ClDirectConv2d.cpp41
-rw-r--r--src/gpu/cl/operators/ClDirectConv2d.h20
-rw-r--r--src/gpu/cl/operators/ClDirectConv3d.cpp14
-rw-r--r--src/gpu/cl/operators/ClDirectConv3d.h17
-rw-r--r--src/gpu/cl/operators/ClElementwiseOperations.cpp58
-rw-r--r--src/gpu/cl/operators/ClElementwiseOperations.h56
-rw-r--r--src/gpu/cl/operators/ClElementwiseUnary.cpp3
-rw-r--r--src/gpu/cl/operators/ClFill.cpp10
-rw-r--r--src/gpu/cl/operators/ClFill.h6
-rw-r--r--src/gpu/cl/operators/ClFlatten.cpp3
-rw-r--r--src/gpu/cl/operators/ClFloor.cpp3
-rw-r--r--src/gpu/cl/operators/ClFullyConnected.cpp282
-rw-r--r--src/gpu/cl/operators/ClFullyConnected.h56
-rw-r--r--src/gpu/cl/operators/ClGemm.cpp472
-rw-r--r--src/gpu/cl/operators/ClGemm.h89
-rw-r--r--src/gpu/cl/operators/ClGemmConv2d.cpp283
-rw-r--r--src/gpu/cl/operators/ClGemmConv2d.h35
-rw-r--r--src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp578
-rw-r--r--src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h33
-rw-r--r--src/gpu/cl/operators/ClGemmLowpOutputStage.cpp23
-rw-r--r--src/gpu/cl/operators/ClGemmLowpOutputStage.h11
-rw-r--r--src/gpu/cl/operators/ClIndirectConv2d.cpp52
-rw-r--r--src/gpu/cl/operators/ClIndirectConv2d.h25
-rw-r--r--src/gpu/cl/operators/ClLogicalNot.cpp3
-rw-r--r--src/gpu/cl/operators/ClMatMul.cpp30
-rw-r--r--src/gpu/cl/operators/ClMatMul.h19
-rw-r--r--src/gpu/cl/operators/ClMul.cpp36
-rw-r--r--src/gpu/cl/operators/ClMul.h30
-rw-r--r--src/gpu/cl/operators/ClPRelu.cpp12
-rw-r--r--src/gpu/cl/operators/ClPRelu.h3
-rw-r--r--src/gpu/cl/operators/ClPermute.cpp10
-rw-r--r--src/gpu/cl/operators/ClPermute.h7
-rw-r--r--src/gpu/cl/operators/ClPool2d.cpp14
-rw-r--r--src/gpu/cl/operators/ClPool2d.h11
-rw-r--r--src/gpu/cl/operators/ClPool3d.cpp8
-rw-r--r--src/gpu/cl/operators/ClPool3d.h5
-rw-r--r--src/gpu/cl/operators/ClQuantize.cpp4
-rw-r--r--src/gpu/cl/operators/ClReshape.cpp5
-rw-r--r--src/gpu/cl/operators/ClScale.cpp11
-rw-r--r--src/gpu/cl/operators/ClScale.h4
-rw-r--r--src/gpu/cl/operators/ClSoftmax.cpp76
-rw-r--r--src/gpu/cl/operators/ClSoftmax.h16
-rw-r--r--src/gpu/cl/operators/ClSub.cpp18
-rw-r--r--src/gpu/cl/operators/ClSub.h12
-rw-r--r--src/gpu/cl/operators/ClTranspose.cpp5
-rw-r--r--src/gpu/cl/operators/ClTransposedConvolution.cpp19
-rw-r--r--src/gpu/cl/operators/ClTransposedConvolution.h19
-rw-r--r--src/gpu/cl/operators/ClWinogradConv2d.cpp175
-rw-r--r--src/gpu/cl/operators/ClWinogradConv2d.h26
-rw-r--r--src/gpu/cl/utils/ClAuxTensorHandler.h26
-rw-r--r--src/graph/DataLayerVisitor.cpp24
-rw-r--r--src/graph/Graph.cpp38
-rw-r--r--src/graph/GraphBuilder.cpp278
-rw-r--r--src/graph/GraphContext.cpp15
-rw-r--r--src/graph/GraphManager.cpp25
-rw-r--r--src/graph/INode.cpp6
-rw-r--r--src/graph/INodeVisitor.cpp1
-rw-r--r--src/graph/PassManager.cpp19
-rw-r--r--src/graph/Tensor.cpp8
-rw-r--r--src/graph/TypeLoader.cpp42
-rw-r--r--src/graph/Utils.cpp55
-rw-r--r--src/graph/Workload.cpp11
-rw-r--r--src/graph/algorithms/TopologicalSort.cpp36
-rw-r--r--src/graph/backends/BackendRegistry.cpp3
-rw-r--r--src/graph/backends/CL/CLDeviceBackend.cpp37
-rw-r--r--src/graph/backends/CL/CLFunctionsFactory.cpp151
-rw-r--r--src/graph/backends/CL/CLNodeValidator.cpp60
-rw-r--r--src/graph/backends/CL/CLSubTensorHandle.cpp7
-rw-r--r--src/graph/backends/CL/CLTensorHandle.cpp9
-rw-r--r--src/graph/backends/NEON/NEDeviceBackend.cpp29
-rw-r--r--src/graph/backends/NEON/NEFunctionFactory.cpp119
-rw-r--r--src/graph/backends/NEON/NENodeValidator.cpp64
-rw-r--r--src/graph/backends/NEON/NESubTensorHandle.cpp7
-rw-r--r--src/graph/backends/NEON/NETensorHandle.cpp10
-rw-r--r--src/graph/detail/CrossLayerMemoryManagerHelpers.cpp59
-rw-r--r--src/graph/detail/ExecutionHelpers.cpp87
-rw-r--r--src/graph/frontend/Stream.cpp5
-rw-r--r--src/graph/frontend/SubStream.cpp5
-rw-r--r--src/graph/mutators/DepthConcatSubTensorMutator.cpp39
-rw-r--r--src/graph/mutators/GroupedConvolutionMutator.cpp77
-rw-r--r--src/graph/mutators/InPlaceOperationMutator.cpp105
-rw-r--r--src/graph/mutators/MutatorUtils.cpp8
-rw-r--r--src/graph/mutators/NodeExecutionMethodMutator.cpp42
-rw-r--r--src/graph/mutators/NodeFusionMutator.cpp169
-rw-r--r--src/graph/mutators/SplitLayerSubTensorMutator.cpp33
-rw-r--r--src/graph/mutators/SyntheticDataTypeMutator.cpp72
-rw-r--r--src/graph/nodes/ActivationLayerNode.cpp4
-rw-r--r--src/graph/nodes/ArgMinMaxLayerNode.cpp17
-rw-r--r--src/graph/nodes/BatchNormalizationLayerNode.cpp4
-rw-r--r--src/graph/nodes/BoundingBoxTransformLayerNode.cpp8
-rw-r--r--src/graph/nodes/ChannelShuffleLayerNode.cpp7
-rw-r--r--src/graph/nodes/ConcatenateLayerNode.cpp22
-rw-r--r--src/graph/nodes/ConstNode.cpp5
-rw-r--r--src/graph/nodes/ConvolutionLayerNode.cpp17
-rw-r--r--src/graph/nodes/DeconvolutionLayerNode.cpp10
-rw-r--r--src/graph/nodes/DepthToSpaceLayerNode.cpp11
-rw-r--r--src/graph/nodes/DepthwiseConvolutionLayerNode.cpp24
-rw-r--r--src/graph/nodes/DequantizationLayerNode.cpp4
-rw-r--r--src/graph/nodes/DetectionOutputLayerNode.cpp9
-rw-r--r--src/graph/nodes/DetectionPostProcessLayerNode.cpp11
-rw-r--r--src/graph/nodes/DummyNode.cpp7
-rw-r--r--src/graph/nodes/EltwiseLayerNode.cpp11
-rw-r--r--src/graph/nodes/FlattenLayerNode.cpp4
-rw-r--r--src/graph/nodes/FullyConnectedLayer.cpp29
-rw-r--r--src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp29
-rw-r--r--src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp40
-rw-r--r--src/graph/nodes/GenerateProposalsLayerNode.cpp14
-rw-r--r--src/graph/nodes/InputNode.cpp5
-rw-r--r--src/graph/nodes/L2NormalizeLayerNode.cpp13
-rw-r--r--src/graph/nodes/NormalizationLayerNode.cpp7
-rw-r--r--src/graph/nodes/NormalizePlanarYUVLayerNode.cpp2
-rw-r--r--src/graph/nodes/PReluLayerNode.cpp2
-rw-r--r--src/graph/nodes/PadLayerNode.cpp10
-rw-r--r--src/graph/nodes/PermuteLayerNode.cpp12
-rw-r--r--src/graph/nodes/PoolingLayerNode.cpp10
-rw-r--r--src/graph/nodes/PrintLayerNode.cpp8
-rw-r--r--src/graph/nodes/PriorBoxLayerNode.cpp7
-rw-r--r--src/graph/nodes/QuantizationLayerNode.cpp2
-rw-r--r--src/graph/nodes/ROIAlignLayerNode.cpp10
-rw-r--r--src/graph/nodes/ReductionLayerNode.cpp9
-rw-r--r--src/graph/nodes/ReorgLayerNode.cpp13
-rw-r--r--src/graph/nodes/ReshapeLayer.cpp10
-rw-r--r--src/graph/nodes/ResizeLayerNode.cpp4
-rw-r--r--src/graph/nodes/SliceLayerNode.cpp10
-rw-r--r--src/graph/nodes/SoftmaxLayerNode.cpp7
-rw-r--r--src/graph/nodes/SplitLayerNode.cpp26
-rw-r--r--src/graph/nodes/StackLayerNode.cpp18
-rw-r--r--src/graph/nodes/StridedSliceLayerNode.cpp2
-rw-r--r--src/graph/printers/DotGraphPrinter.cpp16
-rw-r--r--src/runtime/Allocator.cpp2
-rw-r--r--src/runtime/BlobLifetimeManager.cpp30
-rw-r--r--src/runtime/BlobMemoryPool.cpp6
-rw-r--r--src/runtime/CL/CLBufferAllocator.cpp3
-rw-r--r--src/runtime/CL/CLGEMMHeuristicsHandle.cpp3
-rw-r--r--src/runtime/CL/CLHelpers.cpp41
-rw-r--r--src/runtime/CL/CLMemory.cpp12
-rw-r--r--src/runtime/CL/CLMemoryRegion.cpp26
-rw-r--r--src/runtime/CL/CLOperator.cpp5
-rw-r--r--src/runtime/CL/CLRuntimeContext.cpp6
-rw-r--r--src/runtime/CL/CLScheduler.cpp47
-rw-r--r--src/runtime/CL/CLSubTensor.cpp10
-rw-r--r--src/runtime/CL/CLTensorAllocator.cpp40
-rw-r--r--src/runtime/CL/CLTuner.cpp90
-rw-r--r--src/runtime/CL/ICLSimpleFunction.cpp5
-rw-r--r--src/runtime/CL/Utils.cpp16
-rw-r--r--src/runtime/CL/functions/CLActivationLayer.cpp22
-rw-r--r--src/runtime/CL/functions/CLArgMinMaxLayer.cpp63
-rw-r--r--src/runtime/CL/functions/CLBatchNormalizationLayer.cpp37
-rw-r--r--src/runtime/CL/functions/CLBatchToSpaceLayer.cpp30
-rw-r--r--src/runtime/CL/functions/CLBitwiseAnd.cpp10
-rw-r--r--src/runtime/CL/functions/CLBitwiseNot.cpp5
-rw-r--r--src/runtime/CL/functions/CLBitwiseOr.cpp10
-rw-r--r--src/runtime/CL/functions/CLBitwiseXor.cpp8
-rw-r--r--src/runtime/CL/functions/CLBoundingBoxTransform.cpp19
-rw-r--r--src/runtime/CL/functions/CLCast.cpp22
-rw-r--r--src/runtime/CL/functions/CLChannelShuffleLayer.cpp7
-rw-r--r--src/runtime/CL/functions/CLComparison.cpp37
-rw-r--r--src/runtime/CL/functions/CLConcatenateLayer.cpp28
-rw-r--r--src/runtime/CL/functions/CLConv3D.cpp39
-rw-r--r--src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp32
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp102
-rw-r--r--src/runtime/CL/functions/CLCopy.cpp15
-rw-r--r--src/runtime/CL/functions/CLCrop.cpp48
-rw-r--r--src/runtime/CL/functions/CLCropResize.cpp194
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayer.cpp74
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp17
-rw-r--r--src/runtime/CL/functions/CLDepthConvertLayer.cpp26
-rw-r--r--src/runtime/CL/functions/CLDepthToSpaceLayer.cpp8
-rw-r--r--src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp151
-rw-r--r--src/runtime/CL/functions/CLDequantizationLayer.cpp17
-rw-r--r--src/runtime/CL/functions/CLDirectConvolutionLayer.cpp46
-rw-r--r--src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp80
-rw-r--r--src/runtime/CL/functions/CLElementwiseOperations.cpp206
-rw-r--r--src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp78
-rw-r--r--src/runtime/CL/functions/CLFFT1D.cpp30
-rw-r--r--src/runtime/CL/functions/CLFFT2D.cpp16
-rw-r--r--src/runtime/CL/functions/CLFFTConvolutionLayer.cpp123
-rw-r--r--src/runtime/CL/functions/CLFill.cpp17
-rw-r--r--src/runtime/CL/functions/CLFlattenLayer.cpp24
-rw-r--r--src/runtime/CL/functions/CLFloor.cpp12
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp64
-rw-r--r--src/runtime/CL/functions/CLFuseBatchNormalization.cpp57
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp56
-rw-r--r--src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp88
-rw-r--r--src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp196
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp38
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp34
-rw-r--r--src/runtime/CL/functions/CLGather.cpp8
-rw-r--r--src/runtime/CL/functions/CLGenerateProposalsLayer.cpp192
-rw-r--r--src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp44
-rw-r--r--src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp36
-rw-r--r--src/runtime/CL/functions/CLL2NormalizeLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLLSTMLayer.cpp575
-rw-r--r--src/runtime/CL/functions/CLLSTMLayerQuantized.cpp410
-rw-r--r--src/runtime/CL/functions/CLLogicalAnd.cpp26
-rw-r--r--src/runtime/CL/functions/CLLogicalNot.cpp14
-rw-r--r--src/runtime/CL/functions/CLLogicalOr.cpp26
-rw-r--r--src/runtime/CL/functions/CLMatMul.cpp29
-rw-r--r--src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp7
-rw-r--r--src/runtime/CL/functions/CLNormalizationLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLPReluLayer.cpp23
-rw-r--r--src/runtime/CL/functions/CLPadLayer.cpp41
-rw-r--r--src/runtime/CL/functions/CLPermute.cpp18
-rw-r--r--src/runtime/CL/functions/CLPixelWiseMultiplication.cpp82
-rw-r--r--src/runtime/CL/functions/CLPooling3dLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLPoolingLayer.cpp31
-rw-r--r--src/runtime/CL/functions/CLPriorBoxLayer.cpp34
-rw-r--r--src/runtime/CL/functions/CLQLSTMLayer.cpp942
-rw-r--r--src/runtime/CL/functions/CLQuantizationLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLRNNLayer.cpp53
-rw-r--r--src/runtime/CL/functions/CLROIAlignLayer.cpp20
-rw-r--r--src/runtime/CL/functions/CLROIPoolingLayer.cpp19
-rw-r--r--src/runtime/CL/functions/CLRange.cpp5
-rw-r--r--src/runtime/CL/functions/CLReduceMean.cpp90
-rw-r--r--src/runtime/CL/functions/CLReductionOperation.cpp65
-rw-r--r--src/runtime/CL/functions/CLReorgLayer.cpp7
-rw-r--r--src/runtime/CL/functions/CLReshapeLayer.cpp14
-rw-r--r--src/runtime/CL/functions/CLReverse.cpp7
-rw-r--r--src/runtime/CL/functions/CLScale.cpp15
-rw-r--r--src/runtime/CL/functions/CLSelect.cpp8
-rw-r--r--src/runtime/CL/functions/CLSlice.cpp41
-rw-r--r--src/runtime/CL/functions/CLSoftmaxLayer.cpp22
-rw-r--r--src/runtime/CL/functions/CLSpaceToBatchLayer.cpp71
-rw-r--r--src/runtime/CL/functions/CLSpaceToDepthLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLSplit.cpp3
-rw-r--r--src/runtime/CL/functions/CLStackLayer.cpp21
-rw-r--r--src/runtime/CL/functions/CLStridedSlice.cpp81
-rw-r--r--src/runtime/CL/functions/CLTile.cpp8
-rw-r--r--src/runtime/CL/functions/CLTranspose.cpp12
-rw-r--r--src/runtime/CL/functions/CLUnstack.cpp38
-rw-r--r--src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp65
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp240
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp34
-rw-r--r--src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp169
-rw-r--r--src/runtime/CL/gemm/CLGEMMKernelSelection.h3
-rw-r--r--src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp74
-rw-r--r--src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h7
-rw-r--r--src/runtime/CL/mlgo/Common.h40
-rw-r--r--src/runtime/CL/mlgo/HeuristicTree.cpp89
-rw-r--r--src/runtime/CL/mlgo/HeuristicTree.h24
-rw-r--r--src/runtime/CL/mlgo/MLGOHeuristics.cpp99
-rw-r--r--src/runtime/CL/mlgo/MLGOHeuristics.h6
-rw-r--r--src/runtime/CL/mlgo/MLGOParser.cpp188
-rw-r--r--src/runtime/CL/mlgo/MLGOParser.h9
-rw-r--r--src/runtime/CL/mlgo/Utils.cpp48
-rw-r--r--src/runtime/CL/mlgo/Utils.h10
-rw-r--r--src/runtime/CL/tuners/CLTuningParametersList.cpp50
-rw-r--r--src/runtime/CPP/CPPScheduler.cpp94
-rw-r--r--src/runtime/CPP/SingleThreadScheduler.cpp11
-rw-r--r--src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp157
-rw-r--r--src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp312
-rw-r--r--src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp414
-rw-r--r--src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp21
-rw-r--r--src/runtime/CPP/functions/CPPTopKV.cpp5
-rw-r--r--src/runtime/IScheduler.cpp77
-rw-r--r--src/runtime/ISimpleLifetimeManager.cpp27
-rw-r--r--src/runtime/IWeightsManager.cpp45
-rw-r--r--src/runtime/Memory.cpp9
-rw-r--r--src/runtime/MemoryManagerOnDemand.cpp5
-rw-r--r--src/runtime/NEON/INEOperator.cpp7
-rw-r--r--src/runtime/NEON/INESimpleFunction.cpp4
-rw-r--r--src/runtime/NEON/INESimpleFunctionNoBorder.cpp5
-rw-r--r--src/runtime/NEON/functions/NEActivationLayer.cpp17
-rw-r--r--src/runtime/NEON/functions/NEAddMulAdd.cpp44
-rw-r--r--src/runtime/NEON/functions/NEArgMinMaxLayer.cpp15
-rw-r--r--src/runtime/NEON/functions/NEArithmeticAddition.cpp26
-rw-r--r--src/runtime/NEON/functions/NEArithmeticSubtraction.cpp26
-rw-r--r--src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp25
-rw-r--r--src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp13
-rw-r--r--src/runtime/NEON/functions/NEBitwiseAnd.cpp3
-rw-r--r--src/runtime/NEON/functions/NEBitwiseNot.cpp3
-rw-r--r--src/runtime/NEON/functions/NEBitwiseOr.cpp3
-rw-r--r--src/runtime/NEON/functions/NEBitwiseXor.cpp3
-rw-r--r--src/runtime/NEON/functions/NEBoundingBoxTransform.cpp11
-rw-r--r--src/runtime/NEON/functions/NECast.cpp14
-rw-r--r--src/runtime/NEON/functions/NEChannelShuffleLayer.cpp1
-rw-r--r--src/runtime/NEON/functions/NEConcatenateLayer.cpp30
-rw-r--r--src/runtime/NEON/functions/NEConv3D.cpp27
-rw-r--r--src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp24
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp84
-rw-r--r--src/runtime/NEON/functions/NECopy.cpp12
-rw-r--r--src/runtime/NEON/functions/NECropResize.cpp54
-rw-r--r--src/runtime/NEON/functions/NEDeconvolutionLayer.cpp116
-rw-r--r--src/runtime/NEON/functions/NEDepthConvertLayer.cpp17
-rw-r--r--src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp1
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp193
-rw-r--r--src/runtime/NEON/functions/NEDequantizationLayer.cpp10
-rw-r--r--src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp60
-rw-r--r--src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp27
-rw-r--r--src/runtime/NEON/functions/NEElementwiseOperations.cpp152
-rw-r--r--src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp15
-rw-r--r--src/runtime/NEON/functions/NEFFT1D.cpp29
-rw-r--r--src/runtime/NEON/functions/NEFFT2D.cpp8
-rw-r--r--src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp105
-rw-r--r--src/runtime/NEON/functions/NEFill.cpp10
-rw-r--r--src/runtime/NEON/functions/NEFillBorder.cpp9
-rw-r--r--src/runtime/NEON/functions/NEFlattenLayer.cpp22
-rw-r--r--src/runtime/NEON/functions/NEFloor.cpp12
-rw-r--r--src/runtime/NEON/functions/NEFullyConnectedLayer.cpp77
-rw-r--r--src/runtime/NEON/functions/NEFuseBatchNormalization.cpp42
-rw-r--r--src/runtime/NEON/functions/NEGEMM.cpp62
-rw-r--r--src/runtime/NEON/functions/NEGEMMConv2d.cpp39
-rw-r--r--src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp76
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp61
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp33
-rw-r--r--src/runtime/NEON/functions/NEGather.cpp3
-rw-r--r--src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp187
-rw-r--r--src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp26
-rw-r--r--src/runtime/NEON/functions/NEL2NormalizeLayer.cpp4
-rw-r--r--src/runtime/NEON/functions/NELSTMLayer.cpp510
-rw-r--r--src/runtime/NEON/functions/NELSTMLayerQuantized.cpp383
-rw-r--r--src/runtime/NEON/functions/NELogical.cpp12
-rw-r--r--src/runtime/NEON/functions/NEMatMul.cpp28
-rw-r--r--src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp24
-rw-r--r--src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp3
-rw-r--r--src/runtime/NEON/functions/NENormalizationLayer.cpp10
-rw-r--r--src/runtime/NEON/functions/NEPReluLayer.cpp14
-rw-r--r--src/runtime/NEON/functions/NEPadLayer.cpp90
-rw-r--r--src/runtime/NEON/functions/NEPermute.cpp10
-rw-r--r--src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp50
-rw-r--r--src/runtime/NEON/functions/NEPooling3dLayer.cpp17
-rw-r--r--src/runtime/NEON/functions/NEPoolingLayer.cpp21
-rw-r--r--src/runtime/NEON/functions/NEPriorBoxLayer.cpp13
-rw-r--r--src/runtime/NEON/functions/NEQLSTMLayer.cpp1082
-rw-r--r--src/runtime/NEON/functions/NEQuantizationLayer.cpp10
-rw-r--r--src/runtime/NEON/functions/NERNNLayer.cpp44
-rw-r--r--src/runtime/NEON/functions/NEROIAlignLayer.cpp10
-rw-r--r--src/runtime/NEON/functions/NEROIPoolingLayer.cpp17
-rw-r--r--src/runtime/NEON/functions/NERange.cpp6
-rw-r--r--src/runtime/NEON/functions/NEReduceMean.cpp55
-rw-r--r--src/runtime/NEON/functions/NEReductionOperation.cpp75
-rw-r--r--src/runtime/NEON/functions/NEReorderLayer.cpp19
-rw-r--r--src/runtime/NEON/functions/NEReorgLayer.cpp3
-rw-r--r--src/runtime/NEON/functions/NEReshapeLayer.cpp12
-rw-r--r--src/runtime/NEON/functions/NEReverse.cpp8
-rw-r--r--src/runtime/NEON/functions/NEScale.cpp48
-rw-r--r--src/runtime/NEON/functions/NESelect.cpp1
-rw-r--r--src/runtime/NEON/functions/NESlice.cpp35
-rw-r--r--src/runtime/NEON/functions/NESoftmaxLayer.cpp21
-rw-r--r--src/runtime/NEON/functions/NESpaceToBatchLayer.cpp38
-rw-r--r--src/runtime/NEON/functions/NESpaceToDepthLayer.cpp4
-rw-r--r--src/runtime/NEON/functions/NESplit.cpp2
-rw-r--r--src/runtime/NEON/functions/NEStackLayer.cpp11
-rw-r--r--src/runtime/NEON/functions/NEStridedSlice.cpp59
-rw-r--r--src/runtime/NEON/functions/NETile.cpp3
-rw-r--r--src/runtime/NEON/functions/NETranspose.cpp10
-rw-r--r--src/runtime/NEON/functions/NEUnstack.cpp34
-rw-r--r--src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp44
-rw-r--r--src/runtime/OMP/OMPScheduler.cpp14
-rw-r--r--src/runtime/OffsetLifetimeManager.cpp20
-rw-r--r--src/runtime/OffsetMemoryPool.cpp8
-rw-r--r--src/runtime/OperatorTensor.cpp3
-rw-r--r--src/runtime/PoolManager.cpp11
-rw-r--r--src/runtime/RuntimeContext.cpp3
-rw-r--r--src/runtime/Scheduler.cpp13
-rw-r--r--src/runtime/SchedulerFactory.cpp2
-rw-r--r--src/runtime/SchedulerUtils.cpp19
-rw-r--r--src/runtime/SubTensor.cpp3
-rw-r--r--src/runtime/Tensor.cpp3
-rw-r--r--src/runtime/TensorAllocator.cpp19
-rw-r--r--src/runtime/Utils.cpp15
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp67
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h20
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp125
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h20
-rw-r--r--src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h4
-rw-r--r--src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h14
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp133
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h42
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp127
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h35
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp8
-rw-r--r--src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h2
-rw-r--r--src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h16
-rw-r--r--src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp60
-rw-r--r--src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h9
-rw-r--r--src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h2
-rw-r--r--src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h12
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp396
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h11
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp42
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h15
-rw-r--r--src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h4
-rw-r--r--src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h9
-rw-r--r--support/Bfloat16.h23
-rw-r--r--support/Cast.h4
-rw-r--r--support/DeepCopy.h56
-rw-r--r--support/Half.h4
-rw-r--r--support/Iterable.h3
-rw-r--r--support/Mutex.h20
-rw-r--r--support/Random.h7
-rw-r--r--support/Rounding.h7
-rw-r--r--support/SaturateCast.h1
-rw-r--r--support/Semaphore.h16
-rw-r--r--support/StringSupport.h14
-rw-r--r--support/ToolchainSupport.h26
-rw-r--r--utils/CommonGraphOptions.cpp76
-rw-r--r--utils/CommonGraphOptions.h30
-rw-r--r--utils/GraphUtils.cpp246
-rw-r--r--utils/GraphUtils.h106
-rw-r--r--utils/ImageLoader.h146
-rw-r--r--utils/TypePrinter.h267
-rw-r--r--utils/Utils.cpp61
-rw-r--r--utils/Utils.h230
-rw-r--r--utils/command_line/CommandLineParser.h54
-rw-r--r--utils/command_line/EnumListOption.h25
-rw-r--r--utils/command_line/EnumOption.h17
-rw-r--r--utils/command_line/ListOption.h13
-rw-r--r--utils/command_line/Option.h9
-rw-r--r--utils/command_line/SimpleOption.h7
-rw-r--r--utils/command_line/ToggleOption.h12
1866 files changed, 62956 insertions, 48773 deletions
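
The hunks below are purely mechanical reformatting. As an orientation aid only, the short C++ sketch that follows is hypothetical code (not taken from the patch) condensing the rules that recur throughout the diff: a space after control-flow keywords, no padding inside braced initializers, the brace after extern "C" placed on its own line with an indented body, short constructor initializer lists joined onto the declaration line, and long parameter lists broken one parameter per line with aligned types.

    // Illustrative only: hypothetical example code showing the recurring formatting rules.
    #include <memory>
    #include <string>

    #ifdef __cplusplus
    extern "C"
    { // The opening brace moves to its own line and the block body is indented.
    #endif /* __cplusplus */
        typedef enum
        {
            ExampleNone = 0, /**< Trailing comments stay aligned */
            ExampleSome = 1,
        } ExampleEnum;
    #ifdef __cplusplus
    }
    #endif /* __cplusplus */

    class Example
    {
    public:
        // Short constructor initializer lists are joined onto the declaration line.
        Example(int id, const std::string &name) : _id(id), _name(name)
        {
        }

        // Long parameter lists are broken one parameter per line, types aligned.
        static Example make(int                id,
                            const std::string &name,
                            const std::string &description,
                            bool               enabled);

        void run()
        {
            // A space is inserted between control-flow keywords and '(': if( -> if (
            if (_enabled)
            {
                _id = 0;
            }
        }

    private:
        // Braced initializers lose their inner padding: { nullptr } -> {nullptr}
        std::shared_ptr<int> _object{nullptr};
        int                  _id{0};
        std::string          _name;
        bool                 _enabled{false};
    };
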
diff --git a/arm_compute/Acl.h b/arm_compute/Acl.h
index 1cd45d5756..3e99a00cbe 100644
--- a/arm_compute/Acl.h
+++ b/arm_compute/Acl.h
@@ -25,7 +25,8 @@
#define ARM_COMPUTE_ACL_H_
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /* __cplusplus */
/* Core headers */
diff --git a/arm_compute/Acl.hpp b/arm_compute/Acl.hpp
index 55e04e876d..6a9d585c14 100644
--- a/arm_compute/Acl.hpp
+++ b/arm_compute/Acl.hpp
@@ -75,7 +75,7 @@ struct ObjectDeleter
#define OBJECT_DELETER(obj, func) \
template <> \
struct ObjectDeleter<obj> \
- \
+ \
{ \
static inline AclStatus Destroy(obj v) \
{ \
@@ -171,7 +171,7 @@ protected:
ObjectBase() = default;
protected:
- std::shared_ptr<T> _object{ nullptr }; /**< Library object */
+ std::shared_ptr<T> _object{nullptr}; /**< Library object */
};
/** Equality operator for library object
@@ -221,8 +221,7 @@ public:
* @param[in] status Status returned
* @param[in] msg Error message to be bound with the exception
*/
- Status(StatusCode status, const std::string &msg)
- : _status(status), _msg(msg)
+ Status(StatusCode status, const std::string &msg) : _status(status), _msg(msg)
{
}
/** Returns an explanatory exception message
@@ -266,7 +265,7 @@ private:
*/
static inline void report_status(StatusCode status, const std::string &msg)
{
- if(status != StatusCode::Success)
+ if (status != StatusCode::Success)
{
throw Status(status, msg);
}
@@ -299,7 +298,8 @@ enum class Target
/**< Available execution modes */
enum class ExecutionMode
{
- FastRerun = AclPreferFastRerun, /**< Prefer minimum latency in consecutive runs, might introduce higher startup times */
+ FastRerun =
+ AclPreferFastRerun, /**< Prefer minimum latency in consecutive runs, might introduce higher startup times */
FastStart = AclPreferFastStart, /**< Prefer minimizing startup time */
};
@@ -372,8 +372,7 @@ public:
* @param[in] target Target to create context for
* @param[out] status Status information if requested
*/
- explicit Context(Target target, StatusCode *status = nullptr)
- : Context(target, Options(), status)
+ explicit Context(Target target, StatusCode *status = nullptr) : Context(target, Options(), status)
{
}
/** Constructor
@@ -385,10 +384,11 @@ public:
Context(Target target, const Options &options, StatusCode *status = nullptr)
{
AclContext ctx;
- const auto st = detail::as_enum<StatusCode>(AclCreateContext(&ctx, detail::as_cenum<AclTarget>(target), &options.copts));
+ const auto st =
+ detail::as_enum<StatusCode>(AclCreateContext(&ctx, detail::as_cenum<AclTarget>(target), &options.copts));
reset(ctx);
report_status(st, "[Compute Library] Failed to create context");
- if(status)
+ if (status)
{
*status = st;
}
@@ -424,15 +424,13 @@ public:
* As default options, no tuning will be performed, and the number of scheduling units will
* depends on internal device discovery functionality
*/
- Options()
- : opts{ AclTuningModeNone, 0 } {};
+ Options() : opts{AclTuningModeNone, 0} {};
/** Constructor
*
* @param[in] mode Tuning mode to be used
* @param[in] compute_units Number of scheduling units to be used
*/
- Options(TuningMode mode, int32_t compute_units)
- : opts{ detail::as_cenum<AclTuningMode>(mode), compute_units }
+ Options(TuningMode mode, int32_t compute_units) : opts{detail::as_cenum<AclTuningMode>(mode), compute_units}
{
}
@@ -448,8 +446,7 @@ public:
* @param[in] ctx Context to create queue for
* @param[out] status Status information if requested
*/
- explicit Queue(Context &ctx, StatusCode *status = nullptr)
- : Queue(ctx, Options(), status)
+ explicit Queue(Context &ctx, StatusCode *status = nullptr) : Queue(ctx, Options(), status)
{
}
/** Constructor
@@ -466,7 +463,7 @@ public:
const auto st = detail::as_enum<StatusCode>(AclCreateQueue(&queue, ctx.get(), &options.opts));
reset(queue);
report_status(st, "[Compute Library] Failed to create queue!");
- if(status)
+ if (status)
{
*status = st;
}
@@ -508,8 +505,7 @@ public:
* @param[in] shape Shape of the tensor
* @param[in] data_type Data type of the tensor
*/
- TensorDescriptor(const std::vector<int32_t> &shape, DataType data_type)
- : _shape(shape), _data_type(data_type)
+ TensorDescriptor(const std::vector<int32_t> &shape, DataType data_type) : _shape(shape), _data_type(data_type)
{
_cdesc.ndims = _shape.size();
_cdesc.shape = _shape.data();
@@ -526,7 +522,7 @@ public:
_cdesc = desc;
_data_type = detail::as_enum<DataType>(desc.data_type);
_shape.reserve(desc.ndims);
- for(int32_t d = 0; d < desc.ndims; ++d)
+ for (int32_t d = 0; d < desc.ndims; ++d)
{
_shape.emplace_back(desc.shape[d]);
}
@@ -552,9 +548,9 @@ public:
is_same &= _data_type == other._data_type;
is_same &= _shape.size() == other._shape.size();
- if(is_same)
+ if (is_same)
{
- for(uint32_t d = 0; d < _shape.size(); ++d)
+ for (uint32_t d = 0; d < _shape.size(); ++d)
{
is_same &= _shape[d] == other._shape[d];
}
@@ -592,8 +588,7 @@ public:
* @param[in] desc Tensor descriptor to be used
* @param[out] status Status information if requested
*/
- Tensor(Context &ctx, const TensorDescriptor &desc, StatusCode *status = nullptr)
- : Tensor(ctx, desc, true, status)
+ Tensor(Context &ctx, const TensorDescriptor &desc, StatusCode *status = nullptr) : Tensor(ctx, desc, true, status)
{
}
/** Constructor
@@ -609,7 +604,7 @@ public:
const auto st = detail::as_enum<StatusCode>(AclCreateTensor(&tensor, ctx.get(), desc.get(), allocate));
reset(tensor);
report_status(st, "[Compute Library] Failed to create tensor!");
- if(status)
+ if (status)
{
*status = st;
}
@@ -646,7 +641,8 @@ public:
*/
StatusCode import(void *handle, ImportType type)
{
- const auto st = detail::as_enum<StatusCode>(AclTensorImport(_object.get(), handle, detail::as_cenum<AclImportMemoryType>(type)));
+ const auto st = detail::as_enum<StatusCode>(
+ AclTensorImport(_object.get(), handle, detail::as_cenum<AclImportMemoryType>(type)));
report_status(st, "[Compute Library] Failed to import external memory to tensor!");
return st;
}
@@ -658,7 +654,7 @@ public:
*/
uint64_t get_size()
{
- uint64_t size{ 0 };
+ uint64_t size{0};
const auto st = detail::as_enum<StatusCode>(AclGetTensorSize(_object.get(), &size));
report_status(st, "[Compute Library] Failed to get the size of the tensor");
return size;
@@ -692,13 +688,12 @@ public:
* @param[in] tensor_ Tensor to pack
* @param[in] slot_id_ Slot identification of the tensor in respect with the operator
*/
- PackPair(Tensor *tensor_, int32_t slot_id_)
- : tensor(tensor_), slot_id(slot_id_)
+ PackPair(Tensor *tensor_, int32_t slot_id_) : tensor(tensor_), slot_id(slot_id_)
{
}
- Tensor *tensor{ nullptr }; /**< Tensor object */
- int32_t slot_id{ AclSlotUnknown }; /**< Slot id in respect with the operator */
+ Tensor *tensor{nullptr}; /**< Tensor object */
+ int32_t slot_id{AclSlotUnknown}; /**< Slot id in respect with the operator */
};
public:
@@ -713,7 +708,7 @@ public:
const auto st = detail::as_enum<StatusCode>(AclCreateTensorPack(&pack, ctx.get()));
reset(pack);
report_status(st, "[Compute Library] Failure during tensor pack creation");
- if(status)
+ if (status)
{
*status = st;
}
@@ -741,7 +736,7 @@ public:
std::vector<int32_t> slots(size);
std::vector<AclTensor> tensors(size);
int i = 0;
- for(auto &p : packed)
+ for (auto &p : packed)
{
slots[i] = p.slot_id;
tensors[i] = AclTensor(p.tensor);
@@ -780,13 +775,17 @@ using ActivationDesc = AclActivationDescriptor;
class Activation : public Operator
{
public:
- Activation(Context &ctx, const TensorDescriptor &src, const TensorDescriptor &dst, const ActivationDesc &desc, StatusCode *status = nullptr)
+ Activation(Context &ctx,
+ const TensorDescriptor &src,
+ const TensorDescriptor &dst,
+ const ActivationDesc &desc,
+ StatusCode *status = nullptr)
{
AclOperator op;
const auto st = detail::as_enum<StatusCode>(AclActivation(&op, ctx.get(), src.get(), dst.get(), desc));
reset(op);
report_status(st, "[Compute Library] Failure during Activation operator creation");
- if(status)
+ if (status)
{
*status = st;
}
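
Usage note (not part of this patch): every wrapper constructor touched above follows the same create -> report_status -> optional *status pattern around a C entry point. A minimal standalone sketch of that pattern, using only AclCreateContext as declared in AclEntrypoints.h; the helper name and the assumption that a null AclContextOptions pointer selects default options are illustrative:

    // Sketch only: mirrors the status-reporting pattern of the C++ wrappers above.
    #include "arm_compute/AclEntrypoints.h"
    #include <cstdio>

    static AclStatus make_cpu_context(AclContext *ctx)
    {
        // Assumption: a null options pointer requests default context options.
        const AclStatus st = AclCreateContext(ctx, AclCpu, nullptr);
        if (st != AclSuccess)
        {
            std::printf("[Compute Library] Failed to create context (status %d)\n", static_cast<int>(st));
        }
        return st;
    }
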
diff --git a/arm_compute/AclDescriptors.h b/arm_compute/AclDescriptors.h
index a564bd2141..cdaf7c0dc8 100644
--- a/arm_compute/AclDescriptors.h
+++ b/arm_compute/AclDescriptors.h
@@ -25,37 +25,38 @@
#define ARM_COMPUTE_ACL_DESCRIPTORS_H_
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /** __cplusplus */
-/**< Supported activation types */
-typedef enum
-{
- AclActivationTypeNone = 0, /**< No activation */
- AclIdentity = 1, /**< Identity */
- AclLogistic = 2, /**< Logistic */
- AclTanh = 3, /**< Hyperbolic tangent */
- AclRelu = 4, /**< Rectifier */
- AclBoundedRelu = 5, /**< Upper Bounded Rectifier */
- AclLuBoundedRelu = 6, /**< Lower and Upper Bounded Rectifier */
- AclLeakyRelu = 7, /**< Leaky Rectifier */
- AclSoftRelu = 8, /**< Soft Rectifier */
- AclElu = 9, /**< Exponential Linear Unit */
- AclAbs = 10, /**< Absolute */
- AclSquare = 11, /**< Square */
- AclSqrt = 12, /**< Square root */
- AclLinear = 13, /**< Linear */
- AclHardSwish = 14, /**< Hard-swish */
-} AclActivationType;
+ /**< Supported activation types */
+ typedef enum
+ {
+ AclActivationTypeNone = 0, /**< No activation */
+ AclIdentity = 1, /**< Identity */
+ AclLogistic = 2, /**< Logistic */
+ AclTanh = 3, /**< Hyperbolic tangent */
+ AclRelu = 4, /**< Rectifier */
+ AclBoundedRelu = 5, /**< Upper Bounded Rectifier */
+ AclLuBoundedRelu = 6, /**< Lower and Upper Bounded Rectifier */
+ AclLeakyRelu = 7, /**< Leaky Rectifier */
+ AclSoftRelu = 8, /**< Soft Rectifier */
+ AclElu = 9, /**< Exponential Linear Unit */
+ AclAbs = 10, /**< Absolute */
+ AclSquare = 11, /**< Square */
+ AclSqrt = 12, /**< Square root */
+ AclLinear = 13, /**< Linear */
+ AclHardSwish = 14, /**< Hard-swish */
+ } AclActivationType;
-/**< Activation layer descriptor */
-typedef struct
-{
- AclActivationType type; /**< Activation type */
- float a; /**< Factor &alpha used by some activations */
- float b; /**< Factor &beta used by some activations */
- bool inplace; /**< Hint that src and dst tensors will be the same */
-} AclActivationDescriptor;
+ /**< Activation layer descriptor */
+ typedef struct
+ {
+ AclActivationType type; /**< Activation type */
+ float a; /**< Factor &alpha used by some activations */
+ float b; /**< Factor &beta used by some activations */
+ bool inplace; /**< Hint that src and dst tensors will be the same */
+ } AclActivationDescriptor;
#ifdef __cplusplus
}
#endif /** __cplusplus */
diff --git a/arm_compute/AclEntrypoints.h b/arm_compute/AclEntrypoints.h
index ca3a911f5d..0d4902a3d5 100644
--- a/arm_compute/AclEntrypoints.h
+++ b/arm_compute/AclEntrypoints.h
@@ -27,10 +27,11 @@
#include "arm_compute/AclTypes.h"
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /** __cplusplus */
-/** Create a context object
+ /** Create a context object
*
 * Context is responsible for retaining internal information and working as an aggregate service mechanism
*
@@ -46,11 +47,9 @@ extern "C" {
* - @ref AclUnsupportedTarget if the requested target is unsupported
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclCreateContext(AclContext *ctx,
- AclTarget target,
- const AclContextOptions *options);
+ AclStatus AclCreateContext(AclContext *ctx, AclTarget target, const AclContextOptions *options);
-/** Destroy a given context object
+ /** Destroy a given context object
*
* @param[in] ctx A valid context object to destroy
*
@@ -60,9 +59,9 @@ AclStatus AclCreateContext(AclContext *ctx,
 * - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if the provided context is invalid
*/
-AclStatus AclDestroyContext(AclContext ctx);
+ AclStatus AclDestroyContext(AclContext ctx);
-/** Create an operator queue
+ /** Create an operator queue
*
* Queue is responsible for any scheduling related activities
*
@@ -78,9 +77,9 @@ AclStatus AclDestroyContext(AclContext ctx);
* - @ref AclUnsupportedTarget if the requested target is unsupported
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions *options);
+ AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions *options);
-/** Wait until all elements on the queue have been completed
+ /** Wait until all elements on the queue have been completed
*
* @param[in] queue Queue to wait on completion
*
@@ -91,9 +90,9 @@ AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions
* - @ref AclInvalidArgument if the provided queue is invalid
* - @ref AclRuntimeError on any other runtime related error
*/
-AclStatus AclQueueFinish(AclQueue queue);
+ AclStatus AclQueueFinish(AclQueue queue);
-/** Destroy a given queue object
+ /** Destroy a given queue object
*
 * @param[in] queue A valid queue object to destroy
*
@@ -103,9 +102,9 @@ AclStatus AclQueueFinish(AclQueue queue);
 * - @ref AclSuccess if function was completed successfully
 * - @ref AclInvalidArgument if the provided queue is invalid
*/
-AclStatus AclDestroyQueue(AclQueue queue);
+ AclStatus AclDestroyQueue(AclQueue queue);
-/** Create a Tensor object
+ /** Create a Tensor object
*
* Tensor is a generalized matrix construct that can represent up to ND dimensionality (where N = 6 for Compute Library)
 * The object holds backing memory alongside it to operate on
@@ -121,9 +120,9 @@ AclStatus AclDestroyQueue(AclQueue queue);
* - @ref AclUnsupportedTarget if the requested target is unsupported
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDescriptor *desc, bool allocate);
+ AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDescriptor *desc, bool allocate);
-/** Map a tensor's backing memory to the host
+ /** Map a tensor's backing memory to the host
*
* @param[in] tensor Tensor to be mapped
* @param[in, out] handle A handle to the underlying backing memory
@@ -134,9 +133,9 @@ AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDesc
* - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclMapTensor(AclTensor tensor, void **handle);
+ AclStatus AclMapTensor(AclTensor tensor, void **handle);
-/** Unmap the tensor's backing memory
+ /** Unmap the tensor's backing memory
*
* @param[in] tensor tensor to unmap memory from
* @param[in] handle Backing memory to be unmapped
@@ -147,9 +146,9 @@ AclStatus AclMapTensor(AclTensor tensor, void **handle);
* - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclUnmapTensor(AclTensor tensor, void *handle);
+ AclStatus AclUnmapTensor(AclTensor tensor, void *handle);
-/** Import external memory to a given tensor object
+ /** Import external memory to a given tensor object
*
* @param[in, out] tensor Tensor to import memory to
* @param[in] handle Backing memory to be imported
@@ -159,9 +158,9 @@ AclStatus AclUnmapTensor(AclTensor tensor, void *handle);
* - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType type);
+ AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType type);
-/** Destroy a given tensor object
+ /** Destroy a given tensor object
*
* @param[in,out] tensor A valid tensor object to be destroyed
*
@@ -171,9 +170,9 @@ AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType ty
* - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if the provided tensor is invalid
*/
-AclStatus AclDestroyTensor(AclTensor tensor);
+ AclStatus AclDestroyTensor(AclTensor tensor);
-/** Creates a tensor pack
+ /** Creates a tensor pack
*
* Tensor packs are used to create a collection of tensors that can be passed around for operator execution
*
@@ -187,9 +186,9 @@ AclStatus AclDestroyTensor(AclTensor tensor);
* - @ref AclOutOfMemory if there was a failure allocating memory resources
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx);
+ AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx);
-/** Add a tensor to a tensor pack
+ /** Add a tensor to a tensor pack
*
* @param[in,out] pack Pack to append a tensor to
* @param[in] tensor Tensor to pack
@@ -202,9 +201,9 @@ AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx);
* - @ref AclOutOfMemory if there was a failure allocating memory resources
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id);
+ AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id);
-/** Add a list of tensors to a tensor pack
+ /** Add a list of tensors to a tensor pack
*
* @param[in,out] pack Pack to append the tensors to
* @param[in] tensors Tensors to append to the pack
@@ -218,9 +217,9 @@ AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id);
* - @ref AclOutOfMemory if there was a failure allocating memory resources
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_ids, size_t num_tensors);
+ AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_ids, size_t num_tensors);
-/** Destroy a given tensor pack object
+ /** Destroy a given tensor pack object
*
* @param[in,out] pack A valid tensor pack object to destroy
*
@@ -230,9 +229,9 @@ AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_i
 * - @ref AclSuccess if function was completed successfully
 * - @ref AclInvalidArgument if the provided tensor pack is invalid
*/
-AclStatus AclDestroyTensorPack(AclTensorPack pack);
+ AclStatus AclDestroyTensorPack(AclTensorPack pack);
-/** Eager execution of a given operator on a list of inputs and outputs
+ /** Eager execution of a given operator on a list of inputs and outputs
*
* @param[in] op Operator to execute
* @param[in] queue Queue to schedule the operator on
@@ -247,9 +246,9 @@ AclStatus AclDestroyTensorPack(AclTensorPack pack);
* - @ref AclInvalidArgument if a given argument is invalid
* - @ref AclRuntimeError on any other runtime related error
*/
-AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors);
+ AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors);
-/** Destroy a given operator object
+ /** Destroy a given operator object
*
* @param[in,out] op A valid operator object to destroy
*
@@ -259,7 +258,7 @@ AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors);
 * - @ref AclSuccess if function was completed successfully
 * - @ref AclInvalidArgument if the provided operator is invalid
*/
-AclStatus AclDestroyOperator(AclOperator op);
+ AclStatus AclDestroyOperator(AclOperator op);
#ifdef __cplusplus
}
#endif /* __cplusplus */
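
Usage note (not part of this patch): the entry points in this header cover the whole object lifecycle. The sketch below strings them together in the usual order, using only the signatures shown above together with the AclTensorDescriptor, AclQueueOptions and slot-id definitions from AclTypes.h further down in this patch. The helper name, the shape, the null context options, the generic AclSrc/AclDst slot ids and the omission of per-call error checking are illustrative assumptions, not taken from the library:

    // Lifecycle sketch only; 'op' is assumed to have been created beforehand
    // (e.g. via AclActivation) and intermediate status codes are not checked.
    #include "arm_compute/AclEntrypoints.h"

    static AclStatus run_once(AclOperator op)
    {
        AclContext    ctx   = nullptr;
        AclQueue      queue = nullptr;
        AclTensor     src   = nullptr;
        AclTensor     dst   = nullptr;
        AclTensorPack pack  = nullptr;

        AclCreateContext(&ctx, AclCpu, nullptr); // assumption: null options = defaults

        AclQueueOptions qopts{AclTuningModeNone, 0}; // fields from AclTypes.h
        AclCreateQueue(&queue, ctx, &qopts);

        int32_t             shape[2] = {32, 32};
        AclTensorDescriptor desc{2, shape, AclFloat32, nullptr, 0}; // ndims, shape, type, strides, boffset
        AclCreateTensor(&src, ctx, &desc, true);
        AclCreateTensor(&dst, ctx, &desc, true);

        AclCreateTensorPack(&pack, ctx);
        AclPackTensor(pack, src, AclSrc); // generic slot ids; operator-specific slots may differ
        AclPackTensor(pack, dst, AclDst);

        const AclStatus st = AclRunOperator(op, queue, pack);
        AclQueueFinish(queue);

        AclDestroyTensorPack(pack);
        AclDestroyTensor(dst);
        AclDestroyTensor(src);
        AclDestroyQueue(queue);
        AclDestroyContext(ctx);
        return st;
    }
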
diff --git a/arm_compute/AclOpenClExt.h b/arm_compute/AclOpenClExt.h
index ef80fd2443..28e918d371 100644
--- a/arm_compute/AclOpenClExt.h
+++ b/arm_compute/AclOpenClExt.h
@@ -37,10 +37,11 @@
#pragma GCC diagnostic pop
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /* __cplusplus */
-/** Extract the underlying OpenCL context used by a given Compute Library context object
+ /** Extract the underlying OpenCL context used by a given Compute Library context object
*
* @note @ref AclContext should be of an OpenCL backend target
*
@@ -49,9 +50,9 @@ extern "C" {
*
* @return Status code
*/
-AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context);
+ AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context);
-/** Extract the underlying OpenCL device id used by a given Compute Library context object
+ /** Extract the underlying OpenCL device id used by a given Compute Library context object
*
* @note @ref AclContext should be of an OpenCL backend target
*
@@ -60,9 +61,9 @@ AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context);
*
* @return Status code
*/
-AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device);
+ AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device);
-/** Set the underlying OpenCL context to be used by a given Compute Library context object
+ /** Set the underlying OpenCL context to be used by a given Compute Library context object
*
* @note @ref AclContext should be of an OpenCL backend target
*
@@ -71,9 +72,9 @@ AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device);
*
* @return Status code
*/
-AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context);
+ AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context);
-/** Extract the underlying OpenCL queue used by a given Compute Library queue object
+ /** Extract the underlying OpenCL queue used by a given Compute Library queue object
*
* @note @ref AclQueue should be of an OpenCL backend target
* @note @ref AclQueue refcount should be 0, meaning not used by other objects
@@ -83,9 +84,9 @@ AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context);
*
* @return Status code
*/
-AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue);
+ AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue);
-/** Set the underlying OpenCL queue to be used by a given Compute Library queue object
+ /** Set the underlying OpenCL queue to be used by a given Compute Library queue object
*
* @note @ref AclQueue should be of an OpenCL backend target
 * @note opencl_queue needs to be created from the same context as the AclContext that the queue will use
@@ -95,16 +96,16 @@ AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue);
*
* @return Status code
*/
-AclStatus AclSetClQueue(AclQueue queue, cl_command_queue opencl_queue);
+ AclStatus AclSetClQueue(AclQueue queue, cl_command_queue opencl_queue);
-/** Extract the underlying OpenCL memory object by a given Compute Library tensor object
+ /** Extract the underlying OpenCL memory object by a given Compute Library tensor object
*
* @param[in] tensor A valid non-zero tensor
 * @param[out] opencl_mem Underlying OpenCL memory object
*
* @return Status code
*/
-AclStatus AclGetClMem(AclTensor tensor, cl_mem *opencl_mem);
+ AclStatus AclGetClMem(AclTensor tensor, cl_mem *opencl_mem);
#ifdef __cplusplus
}
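
Usage note (not part of this patch): the accessors above hand back the raw OpenCL handles behind the opaque Acl objects, which is what an application needs for interop with its own CL code. A small sketch; the helper name and the absence of error checking are illustrative, and the calls are only meaningful for objects created for the AclGpuOcl target:

    // Sketch only: pull out the underlying OpenCL handles for interop.
    #include "arm_compute/AclOpenClExt.h"

    static void query_cl_handles(AclContext ctx, AclQueue queue, AclTensor tensor)
    {
        cl_context       cl_ctx   = nullptr;
        cl_device_id     cl_dev   = nullptr;
        cl_command_queue cl_queue = nullptr;
        cl_mem           cl_buf   = nullptr;

        AclGetClContext(ctx, &cl_ctx);
        AclGetClDevice(ctx, &cl_dev);
        AclGetClQueue(queue, &cl_queue);
        AclGetClMem(tensor, &cl_buf);

        (void)cl_ctx;
        (void)cl_dev;
        (void)cl_queue;
        (void)cl_buf;
    }
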
diff --git a/arm_compute/AclOperators.h b/arm_compute/AclOperators.h
index bfdd7b1b9b..4f6f46e9c8 100644
--- a/arm_compute/AclOperators.h
+++ b/arm_compute/AclOperators.h
@@ -31,10 +31,11 @@
#define ARM_COMPUTE_VALIDATE_OPERATOR_SUPPORT ((AclOperator *)(size_t)-1)
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /** __cplusplus */
-/** Create an activation operator
+ /** Create an activation operator
*
 * Applies an activation function to a given tensor.
 * Compute Library supports a wide range of activation functions; see @ref AclActivationType.
@@ -75,11 +76,11 @@ extern "C" {
* - @ref AclUnsupportedTarget if operator for the requested target is unsupported
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclActivation(AclOperator *op,
- AclContext ctx,
- const AclTensorDescriptor *src,
- const AclTensorDescriptor *dst,
- const AclActivationDescriptor info);
+ AclStatus AclActivation(AclOperator *op,
+ AclContext ctx,
+ const AclTensorDescriptor *src,
+ const AclTensorDescriptor *dst,
+ const AclActivationDescriptor info);
#ifdef __cplusplus
}
#endif /** __cplusplus */
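
Usage note (not part of this patch): AclActivation consumes the plain AclActivationDescriptor struct from AclDescriptors.h above. A minimal creation sketch; the helper name and the parameter values are illustrative, and the comment about a/b being unused for plain ReLU is an assumption:

    // Sketch only: build a ReLU activation operator.
    #include "arm_compute/AclDescriptors.h"
    #include "arm_compute/AclOperators.h"
    #include "arm_compute/AclTypes.h"

    static AclStatus make_relu(AclOperator               *op,
                               AclContext                 ctx,
                               const AclTensorDescriptor *src,
                               const AclTensorDescriptor *dst)
    {
        AclActivationDescriptor desc{};
        desc.type    = AclRelu;
        desc.a       = 0.0f;  // assumed unused by plain ReLU
        desc.b       = 0.0f;  // assumed unused by plain ReLU
        desc.inplace = false;
        return AclActivation(op, ctx, src, dst, desc);
    }

The resulting operator can then be scheduled with AclRunOperator and released with AclDestroyOperator, as documented in AclEntrypoints.h.
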
diff --git a/arm_compute/AclTypes.h b/arm_compute/AclTypes.h
index 368be1292b..9a002ad22c 100644
--- a/arm_compute/AclTypes.h
+++ b/arm_compute/AclTypes.h
@@ -28,185 +28,186 @@
#include <stdint.h>
#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-/**< Opaque Context object */
-typedef struct AclContext_ *AclContext;
-/**< Opaque Queue object */
-typedef struct AclQueue_ *AclQueue;
-/**< Opaque Tensor object */
-typedef struct AclTensor_ *AclTensor;
-/**< Opaque Tensor pack object */
-typedef struct AclTensorPack_ *AclTensorPack;
-/**< Opaque Operator object */
-typedef struct AclOperator_ *AclOperator;
-
-// Capabilities bitfield (Note: if multiple are enabled ComputeLibrary will pick the best possible)
-typedef uint64_t AclTargetCapabilities;
-
-/**< Error codes returned by the public entry-points */
-typedef enum AclStatus : int32_t
-{
- AclSuccess = 0, /**< Call succeeded, leading to valid state for all involved objects/data */
- AclRuntimeError = 1, /**< Call failed during execution */
- AclOutOfMemory = 2, /**< Call failed due to failure to allocate resources */
- AclUnimplemented = 3, /**< Call failed as requested capability is not implemented */
- AclUnsupportedTarget = 4, /**< Call failed as an invalid backend was requested */
- AclInvalidTarget = 5, /**< Call failed as an invalid target was passed */
- AclInvalidArgument = 6, /**< Call failed as an invalid argument was passed */
- AclUnsupportedConfig = 7, /**< Call failed as configuration is unsupported */
- AclInvalidObjectState = 8, /**< Call failed as an object has invalid state */
-} AclStatus;
-
-/**< Supported CPU targets */
-typedef enum AclTarget
-{
- AclCpu = 0, /**< Cpu target that uses SIMD extensions */
- AclGpuOcl = 1, /**< OpenCL target for GPU */
-} AclTarget;
-
-/** Execution mode types */
-typedef enum AclExecutionMode
-{
- AclPreferFastRerun = 0, /**< Prioritize performance when multiple iterations are performed */
- AclPreferFastStart = 1, /**< Prioritize performance when a single iteration is expected to be performed */
-} AclExecutionMode;
-
-/** Available CPU capabilities */
-typedef enum AclCpuCapabilities
+extern "C"
{
- AclCpuCapabilitiesAuto = 0, /**< Automatic discovery of capabilities */
-
- AclCpuCapabilitiesNeon = (1 << 0), /**< Enable NEON optimized paths */
- AclCpuCapabilitiesSve = (1 << 1), /**< Enable SVE optimized paths */
- AclCpuCapabilitiesSve2 = (1 << 2), /**< Enable SVE2 optimized paths */
- // Reserve 3, 4, 5, 6
-
- AclCpuCapabilitiesFp16 = (1 << 7), /**< Enable float16 data-type support */
- AclCpuCapabilitiesBf16 = (1 << 8), /**< Enable bfloat16 data-type support */
- // Reserve 9, 10, 11, 12
-
- AclCpuCapabilitiesDot = (1 << 13), /**< Enable paths that use the udot/sdot instructions */
- AclCpuCapabilitiesMmlaInt8 = (1 << 14), /**< Enable paths that use the mmla integer instructions */
- AclCpuCapabilitiesMmlaFp = (1 << 15), /**< Enable paths that use the mmla float instructions */
-
- AclCpuCapabilitiesAll = ~0 /**< Enable all paths */
-} AclCpuCapabilities;
+#endif /* __cplusplus */
-/**< Allocator interface that can be passed to a context */
-typedef struct AclAllocator
-{
- /** Allocate a block of size bytes of memory.
+ /**< Opaque Context object */
+ typedef struct AclContext_ *AclContext;
+ /**< Opaque Queue object */
+ typedef struct AclQueue_ *AclQueue;
+ /**< Opaque Tensor object */
+ typedef struct AclTensor_ *AclTensor;
+ /**< Opaque Tensor pack object */
+ typedef struct AclTensorPack_ *AclTensorPack;
+ /**< Opaque Operator object */
+ typedef struct AclOperator_ *AclOperator;
+
+ // Capabilities bitfield (Note: if multiple are enabled ComputeLibrary will pick the best possible)
+ typedef uint64_t AclTargetCapabilities;
+
+ /**< Error codes returned by the public entry-points */
+ typedef enum AclStatus : int32_t
+ {
+ AclSuccess = 0, /**< Call succeeded, leading to valid state for all involved objects/data */
+ AclRuntimeError = 1, /**< Call failed during execution */
+ AclOutOfMemory = 2, /**< Call failed due to failure to allocate resources */
+ AclUnimplemented = 3, /**< Call failed as requested capability is not implemented */
+ AclUnsupportedTarget = 4, /**< Call failed as an invalid backend was requested */
+ AclInvalidTarget = 5, /**< Call failed as an invalid target was passed */
+ AclInvalidArgument = 6, /**< Call failed as an invalid argument was passed */
+ AclUnsupportedConfig = 7, /**< Call failed as configuration is unsupported */
+ AclInvalidObjectState = 8, /**< Call failed as an object has invalid state */
+ } AclStatus;
+
+ /**< Supported CPU targets */
+ typedef enum AclTarget
+ {
+ AclCpu = 0, /**< Cpu target that uses SIMD extensions */
+ AclGpuOcl = 1, /**< OpenCL target for GPU */
+ } AclTarget;
+
+ /** Execution mode types */
+ typedef enum AclExecutionMode
+ {
+ AclPreferFastRerun = 0, /**< Prioritize performance when multiple iterations are performed */
+ AclPreferFastStart = 1, /**< Prioritize performance when a single iteration is expected to be performed */
+ } AclExecutionMode;
+
+ /** Available CPU capabilities */
+ typedef enum AclCpuCapabilities
+ {
+ AclCpuCapabilitiesAuto = 0, /**< Automatic discovery of capabilities */
+
+ AclCpuCapabilitiesNeon = (1 << 0), /**< Enable NEON optimized paths */
+ AclCpuCapabilitiesSve = (1 << 1), /**< Enable SVE optimized paths */
+ AclCpuCapabilitiesSve2 = (1 << 2), /**< Enable SVE2 optimized paths */
+ // Reserve 3, 4, 5, 6
+
+ AclCpuCapabilitiesFp16 = (1 << 7), /**< Enable float16 data-type support */
+ AclCpuCapabilitiesBf16 = (1 << 8), /**< Enable bfloat16 data-type support */
+ // Reserve 9, 10, 11, 12
+
+ AclCpuCapabilitiesDot = (1 << 13), /**< Enable paths that use the udot/sdot instructions */
+ AclCpuCapabilitiesMmlaInt8 = (1 << 14), /**< Enable paths that use the mmla integer instructions */
+ AclCpuCapabilitiesMmlaFp = (1 << 15), /**< Enable paths that use the mmla float instructions */
+
+ AclCpuCapabilitiesAll = ~0 /**< Enable all paths */
+ } AclCpuCapabilities;
+
+ /**< Allocator interface that can be passed to a context */
+ typedef struct AclAllocator
+ {
+ /** Allocate a block of size bytes of memory.
*
* @param[in] user_data User provided data that can be used by the allocator
* @param[in] size Size of the allocation
*
 * @return A pointer to the allocated block if successful else NULL
*/
- void *(*alloc)(void *user_data, size_t size);
- /** Release a block of size bytes of memory.
+ void *(*alloc)(void *user_data, size_t size);
+ /** Release a block of size bytes of memory.
*
* @param[in] user_data User provided data that can be used by the allocator
* @param[in] size Size of the allocation
*/
- void (*free)(void *user_data, void *ptr);
- /** Allocate a block of size bytes of memory.
+ void (*free)(void *user_data, void *ptr);
+ /** Allocate a block of size bytes of memory.
*
* @param[in] user_data User provided data that can be used by the allocator
* @param[in] size Size of the allocation
*
 * @return A pointer to the allocated block if successful else NULL
*/
- void *(*aligned_alloc)(void *user_data, size_t size, size_t alignment);
- /** Release a block of size bytes of memory.
+ void *(*aligned_alloc)(void *user_data, size_t size, size_t alignment);
+ /** Release a block of size bytes of memory.
*
* @param[in] user_data User provided data that can be used by the allocator
* @param[in] size Size of the allocation
*/
- void (*aligned_free)(void *user_data, void *ptr);
-
- /**< User provided information */
- void *user_data;
-} AclAllocator;
-
-/**< Context options */
-typedef struct AclContextOptions
-{
- AclExecutionMode mode; /**< Execution mode to use */
- AclTargetCapabilities capabilities; /**< Target capabilities */
- bool enable_fast_math; /**< Allow precision loss */
- const char *kernel_config_file; /**< Kernel configuration file */
- int32_t max_compute_units; /**< Max compute units that can be used by a queue created from the context.
+ void (*aligned_free)(void *user_data, void *ptr);
+
+ /**< User provided information */
+ void *user_data;
+ } AclAllocator;
+
+ /**< Context options */
+ typedef struct AclContextOptions
+ {
+ AclExecutionMode mode; /**< Execution mode to use */
+ AclTargetCapabilities capabilities; /**< Target capabilities */
+ bool enable_fast_math; /**< Allow precision loss */
+ const char *kernel_config_file; /**< Kernel configuration file */
+ int32_t max_compute_units; /**< Max compute units that can be used by a queue created from the context.
 If <=0 the system will use the hw concurrency instead */
- AclAllocator *allocator; /**< Allocator to be used by all the memory internally */
-} AclContextOptions;
-
-/**< Supported tuning modes */
-typedef enum
-{
- AclTuningModeNone = 0, /**< No tuning */
- AclRapid = 1, /**< Fast tuning mode, testing a small portion of the tuning space */
- AclNormal = 2, /**< Normal tuning mode, gives a good balance between tuning mode and performance */
- AclExhaustive = 3, /**< Exhaustive tuning mode, increased tuning time but with best results */
-} AclTuningMode;
-
-/**< Queue options */
-typedef struct
-{
- AclTuningMode mode; /**< Tuning mode */
- int32_t compute_units; /**< Compute Units that the queue will deploy */
-} AclQueueOptions;
-
-/**< Supported data types */
-typedef enum AclDataType
-{
- AclDataTypeUnknown = 0, /**< Unknown data type */
- AclUInt8 = 1, /**< 8-bit unsigned integer */
- AclInt8 = 2, /**< 8-bit signed integer */
- AclUInt16 = 3, /**< 16-bit unsigned integer */
- AclInt16 = 4, /**< 16-bit signed integer */
- AclUint32 = 5, /**< 32-bit unsigned integer */
- AclInt32 = 6, /**< 32-bit signed integer */
- AclFloat16 = 7, /**< 16-bit floating point */
- AclBFloat16 = 8, /**< 16-bit brain floating point */
- AclFloat32 = 9, /**< 32-bit floating point */
-} AclDataType;
-
-/**< Supported data layouts for operations */
-typedef enum AclDataLayout
-{
- AclDataLayoutUnknown = 0, /**< Unknown data layout */
- AclNhwc = 1, /**< Native, performant, Compute Library data layout */
- AclNchw = 2, /**< Data layout where width is the fastest changing dimension */
-} AclDataLayout;
-
-/** Type of memory to be imported */
-typedef enum AclImportMemoryType
-{
- AclHostPtr = 0 /**< Host allocated memory */
-} AclImportMemoryType;
-
-/**< Tensor Descriptor */
-typedef struct AclTensorDescriptor
-{
- int32_t ndims; /**< Number of dimensions */
- int32_t *shape; /**< Tensor Shape */
- AclDataType data_type; /**< Tensor Data type */
- int64_t *strides; /**< Strides on each dimension. Linear memory is assumed if nullptr */
- int64_t boffset; /**< Offset in terms of bytes for the first element */
-} AclTensorDescriptor;
-
-/**< Slot type of a tensor */
-typedef enum
-{
- AclSlotUnknown = -1,
- AclSrc = 0,
- AclSrc0 = 0,
- AclSrc1 = 1,
- AclDst = 30,
- AclSrcVec = 256,
-} AclTensorSlot;
+ AclAllocator *allocator; /**< Allocator to be used by all the memory internally */
+ } AclContextOptions;
+
+ /**< Supported tuning modes */
+ typedef enum
+ {
+ AclTuningModeNone = 0, /**< No tuning */
+ AclRapid = 1, /**< Fast tuning mode, testing a small portion of the tuning space */
+ AclNormal = 2, /**< Normal tuning mode, gives a good balance between tuning mode and performance */
+ AclExhaustive = 3, /**< Exhaustive tuning mode, increased tuning time but with best results */
+ } AclTuningMode;
+
+ /**< Queue options */
+ typedef struct
+ {
+ AclTuningMode mode; /**< Tuning mode */
+ int32_t compute_units; /**< Compute Units that the queue will deploy */
+ } AclQueueOptions;
+
+ /**< Supported data types */
+ typedef enum AclDataType
+ {
+ AclDataTypeUnknown = 0, /**< Unknown data type */
+ AclUInt8 = 1, /**< 8-bit unsigned integer */
+ AclInt8 = 2, /**< 8-bit signed integer */
+ AclUInt16 = 3, /**< 16-bit unsigned integer */
+ AclInt16 = 4, /**< 16-bit signed integer */
+ AclUint32 = 5, /**< 32-bit unsigned integer */
+ AclInt32 = 6, /**< 32-bit signed integer */
+ AclFloat16 = 7, /**< 16-bit floating point */
+ AclBFloat16 = 8, /**< 16-bit brain floating point */
+ AclFloat32 = 9, /**< 32-bit floating point */
+ } AclDataType;
+
+ /**< Supported data layouts for operations */
+ typedef enum AclDataLayout
+ {
+ AclDataLayoutUnknown = 0, /**< Unknown data layout */
+ AclNhwc = 1, /**< Native, performant, Compute Library data layout */
+ AclNchw = 2, /**< Data layout where width is the fastest changing dimension */
+ } AclDataLayout;
+
+ /** Type of memory to be imported */
+ typedef enum AclImportMemoryType
+ {
+ AclHostPtr = 0 /**< Host allocated memory */
+ } AclImportMemoryType;
+
+ /**< Tensor Descriptor */
+ typedef struct AclTensorDescriptor
+ {
+ int32_t ndims; /**< Number of dimensions */
+ int32_t *shape; /**< Tensor Shape */
+ AclDataType data_type; /**< Tensor Data type */
+ int64_t *strides; /**< Strides on each dimension. Linear memory is assumed if nullptr */
+ int64_t boffset; /**< Offset in terms of bytes for the first element */
+ } AclTensorDescriptor;
+
+ /**< Slot type of a tensor */
+ typedef enum
+ {
+ AclSlotUnknown = -1,
+ AclSrc = 0,
+ AclSrc0 = 0,
+ AclSrc1 = 1,
+ AclDst = 30,
+ AclSrcVec = 256,
+ } AclTensorSlot;
#ifdef __cplusplus
}
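
Usage note (not part of this patch): because AclContextOptions is a plain struct, a context can be configured by filling it in field by field before calling AclCreateContext. A hedged sketch; the helper name and the assumption that a null kernel_config_file and a null allocator fall back to library defaults are mine, not stated by the header:

    // Sketch only: assemble default-ish CPU context options.
    #include "arm_compute/AclTypes.h"

    static AclContextOptions default_cpu_options()
    {
        AclContextOptions opts{};
        opts.mode               = AclPreferFastRerun;
        opts.capabilities       = AclCpuCapabilitiesAuto;
        opts.enable_fast_math   = false;
        opts.kernel_config_file = nullptr; // assumption: no kernel configuration file
        opts.max_compute_units  = 0;       // <= 0: use the hardware concurrency (see comment above)
        opts.allocator          = nullptr; // assumption: let the library manage memory
        return opts;
    }
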
diff --git a/arm_compute/AclUtils.h b/arm_compute/AclUtils.h
index ef5fa42708..61a93e6060 100644
--- a/arm_compute/AclUtils.h
+++ b/arm_compute/AclUtils.h
@@ -27,10 +27,11 @@
#include "arm_compute/AclTypes.h"
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /** __cplusplus */
-/** Get the size of the existing tensor in bytes
+ /** Get the size of the existing tensor in bytes
*
* @note The size isn't based on allocated memory, but based on information in its descriptor (dimensions, data type, etc.).
*
@@ -42,9 +43,9 @@ extern "C" {
* - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size);
+ AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size);
-/** Get the descriptor of this tensor
+ /** Get the descriptor of this tensor
*
 * @param[in] tensor A tensor of interest
* @param[out] desc The descriptor of the tensor
@@ -54,7 +55,7 @@ AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size);
* - @ref AclSuccess if function was completed successfully
* - @ref AclInvalidArgument if a given argument is invalid
*/
-AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescriptor *desc);
+ AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescriptor *desc);
#ifdef __cplusplus
}
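
Usage note (not part of this patch): both utility entry points above use the same out-parameter pattern. A short sketch (the helper name is illustrative):

    // Sketch only: query size and descriptor of an existing tensor.
    #include "arm_compute/AclUtils.h"
    #include <cstdio>

    static void print_tensor_info(AclTensor tensor)
    {
        uint64_t            size = 0;
        AclTensorDescriptor desc{};
        if (AclGetTensorSize(tensor, &size) == AclSuccess && AclGetTensorDescriptor(tensor, &desc) == AclSuccess)
        {
            std::printf("tensor: %d dimensions, %llu bytes\n", static_cast<int>(desc.ndims),
                        static_cast<unsigned long long>(size));
        }
    }
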
diff --git a/arm_compute/AclVersion.h b/arm_compute/AclVersion.h
index 0b05a5e7dc..6eed13b924 100644
--- a/arm_compute/AclVersion.h
+++ b/arm_compute/AclVersion.h
@@ -25,17 +25,18 @@
#define ARM_COMPUTE_ACL_VERSION_H_
#ifdef __cplusplus
-extern "C" {
+extern "C"
+{
#endif /* __cplusplus */
-/** Semantic versioning information */
-typedef struct AclVersion
-{
- int major; /**< Major version, is increased on API incompatible changes */
- int minor; /**< Minor version, is increased on adding backward compatible functionality */
- int patch; /**< Patch version, is increased when doing backward compatible fixes */
- const char *build_info; /**< Build related information */
-} AclVersion;
+ /** Semantic versioning information */
+ typedef struct AclVersion
+ {
+ int major; /**< Major version, is increased on API incompatible changes */
+ int minor; /**< Minor version, is increased on adding backward compatible functionality */
+ int patch; /**< Patch version, is increased when doing backward compatible fixes */
+ const char *build_info; /**< Build related information */
+ } AclVersion;
/**< Major version, is increased on API incompatible changes */
#define ARM_COMPUTE_LIBRARY_VERSION_MAJOR 0
@@ -44,11 +45,11 @@ typedef struct AclVersion
/**< Patch version, is increased when doing backward compatible fixes */
#define ARM_COMPUTE_LIBRARY_VERSION_PATCH 0
-/** Get library's version meta-data
+ /** Get library's version meta-data
*
* @return Version information
*/
-const AclVersion *AclVersionInfo();
+ const AclVersion *AclVersionInfo();
#ifdef __cplusplus
}
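
Usage note (not part of this patch): a trivial sketch of consuming the version query declared above (the helper name is illustrative):

    // Sketch only: print the library version reported at runtime.
    #include "arm_compute/AclVersion.h"
    #include <cstdio>

    static void print_acl_version()
    {
        const AclVersion *v = AclVersionInfo();
        std::printf("Compute Library %d.%d.%d (%s)\n", v->major, v->minor, v->patch, v->build_info);
    }
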
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index 60e0f95f83..dcd3b45670 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -250,8 +250,12 @@ public:
*
* @return The created kernel.
*/
- Kernel create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
- const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const;
+ Kernel create_kernel(const std::string &kernel_name,
+ const std::string &program_name,
+ const std::string &program_source,
+ const std::string &kernel_path,
+ const StringSet &build_options_set,
+ bool is_binary) const;
/** Clear the library's cache of binary programs
*/
@@ -323,7 +327,8 @@ private:
* @param[in] program_source Source of the program.
* @param[in] is_binary Flag to indicate if the program source is binary.
*/
- const Program &load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const;
+ const Program &
+ load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const;
/** Generates the build options given a string of user defined ones
*
@@ -343,11 +348,11 @@ private:
*/
std::string stringify_set(const StringSet &s, const std::string &kernel_path) const;
- cl::Context _context; /**< Underlying CL context. */
- CLDevice _device; /**< Underlying CL device. */
+ cl::Context _context; /**< Underlying CL context. */
+ CLDevice _device; /**< Underlying CL device. */
mutable std::map<std::string, const Program> _programs_map; /**< Map with all already loaded program data. */
mutable std::map<std::string, cl::Program> _built_programs_map; /**< Map with all already built program data. */
- bool _is_wbsm_supported; /**< Whether the worksize batch size modifier is supported */
+ bool _is_wbsm_supported; /**< Whether the worksize batch size modifier is supported */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */
diff --git a/arm_compute/core/CL/CLDevice.h b/arm_compute/core/CL/CLDevice.h
index 5e0f86e6d9..ded6bb8493 100644
--- a/arm_compute/core/CL/CLDevice.h
+++ b/arm_compute/core/CL/CLDevice.h
@@ -44,8 +44,7 @@ class CLDevice : public IDevice
{
public:
/** Default Constructor */
- CLDevice()
- : _device(cl::Device()), _options()
+ CLDevice() : _device(cl::Device()), _options()
{
}
@@ -53,8 +52,7 @@ public:
*
* @param[in] cl_device OpenCL device
*/
- CLDevice(const cl::Device &cl_device)
- : _device(), _options()
+ CLDevice(const cl::Device &cl_device) : _device(), _options()
{
_device = cl_device;
@@ -66,13 +64,13 @@ public:
std::string extensions = _device.getInfo<CL_DEVICE_EXTENSIONS>();
std::istringstream iss(extensions);
- for(std::string s; iss >> s;)
+ for (std::string s; iss >> s;)
{
_options.extensions.insert(s);
}
// SW workaround for G76
- if(_options.gpu_target == GPUTarget::G76)
+ if (_options.gpu_target == GPUTarget::G76)
{
_options.extensions.insert("cl_arm_integer_dot_product_int8");
}
@@ -153,15 +151,15 @@ public:
*/
std::tuple<bool, std::string> is_non_uniform_workgroup_supported() const
{
- if(version() == CLVersion::CL30 && get_cl_non_uniform_work_group_supported(_device))
+ if (version() == CLVersion::CL30 && get_cl_non_uniform_work_group_supported(_device))
{
return {true, " -cl-std=CL3.0 "};
}
- else if(version() == CLVersion::CL20)
+ else if (version() == CLVersion::CL20)
{
return {true, " -cl-std=CL2.0 "};
}
- else if(supported("cl_arm_non_uniform_work_group_size"))
+ else if (supported("cl_arm_non_uniform_work_group_size"))
{
return {true, " -cl-arm-non-uniform-work-group-size "};
}
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 20d93df5a1..1a639e47f9 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -179,7 +179,9 @@ bool dot8_acc_supported(const cl::Device &device);
*
* @return True if the configuration is supported
*/
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout);
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+ const Size2D &kernel_size,
+ DataLayout data_layout);
/** Helper function to get the preferred native vector width size for built-in scalar types that can be put into vectors
*
@@ -215,7 +217,9 @@ bool image2d_from_buffer_supported(const cl::Device &device);
*
* @return An opencl kernel
*/
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts = std::set<std::string>());
+cl::Kernel create_kernel(const CLCompileContext &ctx,
+ const std::string &kernel_name,
+ const std::set<std::string> &build_opts = std::set<std::string>());
/** Creates a suitable LWS hint object for parallel implementations. Sets the number of WG based on the input size.
 * If the input width is smaller than 128, fewer than 8 threads can be used.
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
index 00b7cda2e1..0f088e2b10 100644
--- a/arm_compute/core/CL/CLTypes.h
+++ b/arm_compute/core/CL/CLTypes.h
@@ -63,15 +63,13 @@ struct CLDeviceOptions
struct CLQuantization
{
/** Default Constructor */
- CLQuantization()
- : scale(nullptr), offset(nullptr) {};
+ CLQuantization() : scale(nullptr), offset(nullptr){};
/** Constructor
*
* @param[in] scale OpenCL scale array
* @param[in] offset OpenCL offset array
*/
- CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset)
- : scale(scale), offset(offset) {};
+ CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset) : scale(scale), offset(offset){};
const ICLFloatArray *scale; /**< Quantization scale array */
const ICLInt32Array *offset; /**< Quantization offset array */
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
index 57f842b6f9..a2b2baa5b3 100644
--- a/arm_compute/core/CL/ICLArray.h
+++ b/arm_compute/core/CL/ICLArray.h
@@ -40,8 +40,7 @@ public:
* @param[in] max_num_values Maximum size of the array.
*
*/
- explicit ICLArray(size_t max_num_values)
- : IArray<T>(max_num_values), _mapping(nullptr)
+ explicit ICLArray(size_t max_num_values) : IArray<T>(max_num_values), _mapping(nullptr)
{
}
@@ -125,5 +124,5 @@ using ICLInt16Array = ICLArray<cl_short>;
using ICLInt32Array = ICLArray<cl_int>;
/** Interface for OpenCL Array of floats. */
using ICLFloatArray = ICLArray<cl_float>;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLARRAY_H*/
diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h
index 78d3757e59..8de5423762 100644
--- a/arm_compute/core/CL/ICLTensor.h
+++ b/arm_compute/core/CL/ICLTensor.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_ICLTENSOR_H
#define ARM_COMPUTE_ICLTENSOR_H
-#include "arm_compute/core/ITensor.h"
-
#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/ITensor.h"
#include <cstdint>
@@ -34,7 +33,7 @@ namespace cl
{
class Buffer;
class CommandQueue;
-}
+} // namespace cl
namespace arm_compute
{
@@ -113,5 +112,5 @@ private:
};
using ICLImage = ICLTensor;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLTENSOR_H */
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index f86d55a9ea..a5c4e39df2 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -31,8 +31,8 @@
#ifndef ARM_COMPUTE_NO_EXCEPTIONS
#define CL_HPP_ENABLE_EXCEPTIONS
#endif // ARM_COMPUTE_NO_EXCEPTIONS
-#define CL_TARGET_OPENCL_VERSION 300
-#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_TARGET_OPENCL_VERSION 300
+#define CL_HPP_TARGET_OPENCL_VERSION 110
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
@@ -40,7 +40,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#if defined(__GNUG__) && __GNUG__ >= 8
#pragma GCC diagnostic ignored "-Wcatch-value"
-#endif // defined(__GNUG__) && __GNUG__ >= 8
+#endif // defined(__GNUG__) && __GNUG__ >= 8
#include <CL/opencl.hpp> // include new hpp header instead of cl2.hpp
#pragma GCC diagnostic pop
@@ -88,8 +88,7 @@ public:
*/
bool load_default();
-#define DECLARE_FUNCTION_PTR(func_name) \
- std::function<decltype(func_name)> func_name##_ptr = nullptr
+#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr
DECLARE_FUNCTION_PTR(clCreateContext);
DECLARE_FUNCTION_PTR(clCreateContextFromType);
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index e4cbd9ff9b..b080a86938 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -78,10 +78,10 @@ public:
 /* Delete move and copy constructors and assignment operators */
- CPUInfo(CPUInfo const &) = delete; // Copy construct
- CPUInfo(CPUInfo &&) = delete; // Move construct
+ CPUInfo(CPUInfo const &) = delete; // Copy construct
+ CPUInfo(CPUInfo &&) = delete; // Move construct
CPUInfo &operator=(CPUInfo const &) = delete; // Copy assign
- CPUInfo &operator=(CPUInfo &&) = delete; // Move assign
+ CPUInfo &operator=(CPUInfo &&) = delete; // Move assign
/** Checks if the cpu model supports fp16.
*
@@ -179,9 +179,9 @@ private:
/** Information about executing thread and CPU. */
struct ThreadInfo
{
- int thread_id{ 0 };
- int num_threads{ 1 };
- const CPUInfo *cpu_info{ nullptr };
+ int thread_id{0};
+ int num_threads{1};
+ const CPUInfo *cpu_info{nullptr};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_TYPES_H */
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index 00a10555e3..03967a536d 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -25,9 +25,9 @@
#define ARM_COMPUTE_ICPPKERNEL_H
#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/IKernel.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
namespace arm_compute
{
@@ -38,7 +38,7 @@ class ITensor;
class ICPPKernel : public IKernel
{
public:
- static constexpr size_t default_mws = 1; /* Default minimum workload size value - no impact */
+ static constexpr size_t default_mws = 1; /* Default minimum workload size value - no impact */
/** Default destructor */
virtual ~ICPPKernel() = default;
diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
index 068b37d80c..dd91595ea6 100644
--- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
@@ -63,8 +63,16 @@ public:
* @param[out] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32
* @param[in] info (Optional) BoxNMSLimitInfo information.
*/
- void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ void configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out = nullptr,
+ ITensor *keeps = nullptr,
+ ITensor *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -74,9 +82,9 @@ public:
void run_nmslimit();
private:
- const ITensor *_scores_in;
- const ITensor *_boxes_in;
- const ITensor *_batch_splits_in;
+ const ITensor *_scores_in;
+ const ITensor *_boxes_in;
+ const ITensor *_batch_splits_in;
ITensor *_scores_out;
ITensor *_boxes_out;
ITensor *_classes;
diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
index e32b5d8f7b..d1f7f8670f 100644
--- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H
#define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -65,7 +64,12 @@ public:
* @param[in] iou_threshold The threshold used in non maximum suppression.
*
*/
- void configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size, const float score_threshold, const float iou_threshold);
+ void configure(const ITensor *input_bboxes,
+ const ITensor *input_scores,
+ ITensor *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold);
/** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppressionKernel
*
@@ -77,8 +81,12 @@ public:
* @param[in] iou_threshold The threshold used in non maximum suppression.
*
*/
- static Status validate(const ITensorInfo *input_bboxes, const ITensorInfo *input_scores, const ITensorInfo *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold);
+ static Status validate(const ITensorInfo *input_bboxes,
+ const ITensorInfo *input_scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
index 1245dbc14c..7326a10e2f 100644
--- a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
@@ -69,7 +69,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
+ static Status
+ validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h
index f6e1f4d282..d1240bb10a 100644
--- a/arm_compute/core/Coordinates.h
+++ b/arm_compute/core/Coordinates.h
@@ -42,8 +42,7 @@ public:
* @param[in] coords Values to initialize the dimensions.
*/
template <typename... Ts>
- constexpr Coordinates(Ts... coords)
- : Dimensions{ coords... }
+ constexpr Coordinates(Ts... coords) : Dimensions{coords...}
{
}
/** Allow instances of this class to be copy constructed */
@@ -57,5 +56,5 @@ public:
/** Default destructor */
~Coordinates() = default;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_COORDINATES_H*/
diff --git a/arm_compute/core/CoreTypes.h b/arm_compute/core/CoreTypes.h
index 4a48a36651..1a9db1937c 100644
--- a/arm_compute/core/CoreTypes.h
+++ b/arm_compute/core/CoreTypes.h
@@ -25,6 +25,7 @@
#define ACL_ARM_COMPUTE_CORE_CORETYPES
#include "arm_compute/core/Strides.h"
+
#include "support/Half.h"
/** CoreTypes.h groups together essential small types that are used across functions */
@@ -146,9 +147,11 @@ public:
* @param[in] pad_y (Optional) Padding, in elements, across y. Defaults to 0.
* @param[in] round (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
*/
- PadStrideInfo(unsigned int stride_x = 1, unsigned int stride_y = 1,
- unsigned int pad_x = 0, unsigned int pad_y = 0,
- DimensionRoundingType round = DimensionRoundingType::FLOOR)
+ PadStrideInfo(unsigned int stride_x = 1,
+ unsigned int stride_y = 1,
+ unsigned int pad_x = 0,
+ unsigned int pad_y = 0,
+ DimensionRoundingType round = DimensionRoundingType::FLOOR)
: _stride(std::make_pair(stride_x, stride_y)),
_pad_left(pad_x),
_pad_top(pad_y),
@@ -167,9 +170,12 @@ public:
* @param[in] pad_bottom Padding across y on the bottom, in elements.
* @param[in] round Dimensions rounding.
*/
- PadStrideInfo(unsigned int stride_x, unsigned int stride_y,
- unsigned int pad_left, unsigned int pad_right,
- unsigned int pad_top, unsigned int pad_bottom,
+ PadStrideInfo(unsigned int stride_x,
+ unsigned int stride_y,
+ unsigned int pad_left,
+ unsigned int pad_right,
+ unsigned int pad_top,
+ unsigned int pad_bottom,
DimensionRoundingType round)
: _stride(std::make_pair(stride_x, stride_y)),
_pad_left(pad_left),
@@ -243,10 +249,10 @@ public:
private:
std::pair<unsigned int, unsigned int> _stride;
- unsigned int _pad_left;
- unsigned int _pad_top;
- unsigned int _pad_right;
- unsigned int _pad_bottom;
+ unsigned int _pad_left;
+ unsigned int _pad_top;
+ unsigned int _pad_right;
+ unsigned int _pad_bottom;
DimensionRoundingType _round_type;
};
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
index 2ebfcd7f83..bb8692d70a 100644
--- a/arm_compute/core/Dimensions.h
+++ b/arm_compute/core/Dimensions.h
@@ -50,8 +50,7 @@ public:
* @param[in] dims Values to initialize the dimensions.
*/
template <typename... Ts>
- explicit Dimensions(Ts... dims)
- : _id{ { static_cast<T>(dims)... } }, _num_dimensions{ sizeof...(dims) }
+ explicit Dimensions(Ts... dims) : _id{{static_cast<T>(dims)...}}, _num_dimensions{sizeof...(dims)}
{
}
@@ -78,7 +77,7 @@ public:
ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
_id[dimension] = value;
// Don't increase the number of dimensions if the new dimension is 1
- if(increase_dim_unit || value != 1)
+ if (increase_dim_unit || value != 1)
{
_num_dimensions = std::max(_num_dimensions, dimension + 1);
}
@@ -108,7 +107,7 @@ public:
void increment(size_t dim, T step = 1)
{
ARM_COMPUTE_ERROR_ON(dim >= _num_dimensions);
- if((std::numeric_limits<T>::max() - _id[dim]) >= step)
+ if ((std::numeric_limits<T>::max() - _id[dim]) >= step)
{
_id[dim] += step;
}
@@ -162,7 +161,7 @@ public:
const size_t last = std::min(_num_dimensions, first + n);
- if(last > (first + 1))
+ if (last > (first + 1))
{
// Collapse dimensions into the first
_id[first] = std::accumulate(&_id[first], &_id[last], 1, std::multiplies<T>());
@@ -196,7 +195,7 @@ public:
void remove(size_t idx)
{
ARM_COMPUTE_ERROR_ON(_num_dimensions < 1);
- if(idx >= _num_dimensions)
+ if (idx >= _num_dimensions)
{
return;
}
@@ -262,7 +261,7 @@ protected:
~Dimensions() = default;
std::array<T, num_max_dimensions> _id;
- size_t _num_dimensions{ 0 };
+ size_t _num_dimensions{0};
};
/** Check that given dimensions are equal.
@@ -289,5 +288,5 @@ inline bool operator!=(const Dimensions<T> &lhs, const Dimensions<T> &rhs)
{
return !(lhs == rhs);
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_DIMENSIONS_H*/
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index 0854f2c527..7a7033805a 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -53,8 +53,7 @@ class Status
{
public:
/** Default Constructor **/
- Status()
- : _code(ErrorCode::OK), _error_description(" ")
+ Status() : _code(ErrorCode::OK), _error_description(" ")
{
}
/** Default Constructor
@@ -101,7 +100,7 @@ public:
/** Throws a runtime exception in case it contains a valid error status */
void throw_if_error() const
{
- if(!bool(*this))
+ if (!bool(*this))
{
internal_throw_on_error();
}
@@ -141,7 +140,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] err Error status
*/
[[noreturn]] void throw_error(Status err);
-}
+} // namespace arm_compute
/** To avoid unused variables warnings
*
* This is useful if for example a variable is only used
@@ -156,7 +155,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] error_code Error code.
* @param[in] msg Message to encapsulate.
*/
-#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg)
+#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) \
+ arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg)
/** Creates an error on location with a given message
*
@@ -166,7 +166,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] line Line in which the error occurred.
* @param[in] msg Message to display before abandoning.
*/
-#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) arm_compute::create_error_msg(error_code, func, file, line, msg)
+#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) \
+ arm_compute::create_error_msg(error_code, func, file, line, msg)
/** Creates an error on location with a given message. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -178,14 +179,14 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...) \
- do \
- { \
- std::array<char, 512> out{ 0 }; \
- int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
- snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
- arm_compute::create_error(error_code, std::string(out.data())); \
- } while(false)
+#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...) \
+ do \
+ { \
+ std::array<char, 512> out{0}; \
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+ snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
+ arm_compute::create_error(error_code, std::string(out.data())); \
+ } while (false)
/** An error is returned with the given description.
*
@@ -195,7 +196,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
do \
{ \
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Checks if a status contains an error and returns it
*
@@ -205,18 +206,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
do \
{ \
const auto s = status; \
- if(!bool(s)) \
+ if (!bool(s)) \
{ \
return s; \
} \
- } while(false)
+ } while (false)
/** Checks if an error value is valid if not throws an exception with the error
*
* @param[in] error Error value to check.
*/
-#define ARM_COMPUTE_THROW_ON_ERROR(error) \
- error.throw_if_error();
+#define ARM_COMPUTE_THROW_ON_ERROR(error) error.throw_if_error();
/** If the condition is true, an error is returned. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -228,28 +228,29 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(cond, msg, ...) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
- std::array<char, 512> out{ 0 }; \
+ std::array<char, 512> out{0}; \
int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", __func__, __FILE__, __LINE__); \
snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
return arm_compute::create_error(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, an error is returned
*
* @param[in] cond Condition to evaluate.
* @param[in] msg Error description message
*/
-#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg) \
- do \
- { \
- if(cond) \
- { \
- return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, msg); \
- } \
- } while(false)
+#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg) \
+ do \
+ { \
+ if (cond) \
+ { \
+ return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, \
+ msg); \
+ } \
+ } while (false)
/** If the condition is true, an error is thrown. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -261,17 +262,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...) \
- do \
- { \
- if(cond) \
- { \
- std::array<char, 512> out{ 0 }; \
- int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
- snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
- return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
- } \
- } while(false)
+#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...) \
+ do \
+ { \
+ if (cond) \
+ { \
+ std::array<char, 512> out{0}; \
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+ snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
+ return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
+ } \
+ } while (false)
/** If the condition is true, an error is thrown.
*
@@ -284,18 +285,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, msg) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
return arm_compute::create_error_msg(ErrorCode::RUNTIME_ERROR, func, file, line, msg); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, an error is returned
*
* @param[in] cond Condition to evaluate
*/
-#define ARM_COMPUTE_RETURN_ERROR_ON(cond) \
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
+#define ARM_COMPUTE_RETURN_ERROR_ON(cond) ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
/** If the condition is true, an error is returned
*
@@ -314,11 +314,12 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] line Line in which the error occurred.
* @param[in] msg Message to display.
*/
-#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg) \
- do \
- { \
- arm_compute::throw_error(arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \
- } while(false)
+#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg) \
+ do \
+ { \
+ arm_compute::throw_error( \
+ arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \
+ } while (false)
/** Print the given message then throw an std::runtime_error. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -332,11 +333,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, ...) \
do \
{ \
- std::array<char, 512> out{ 0 }; \
- int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+ std::array<char, 512> out{0}; \
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
arm_compute::throw_error(arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data()))); \
- } while(false)
+ } while (false)
/** Print the given message then throw an std::runtime_error. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -361,7 +362,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT
+#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) \
+ ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT
/** Print the given message then throw an std::runtime_error.
*
@@ -380,11 +382,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_EXIT_ON_MSG(cond, msg) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
ARM_COMPUTE_ERROR(msg); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, the given message is printed and program exits. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -396,27 +398,25 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, ...) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
ARM_COMPUTE_ERROR_VAR(msg, __VA_ARGS__); \
} \
- } while(false)
+ } while (false)
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
/** Checks if a status value is valid if not throws an exception with the error
*
* @param[in] status Status value to check.
*/
-#define ARM_COMPUTE_ERROR_THROW_ON(status) \
- status.throw_if_error()
+#define ARM_COMPUTE_ERROR_THROW_ON(status) status.throw_if_error()
/** If the condition is true, the given message is printed and an exception is thrown
*
* @param[in] cond Condition to evaluate.
* @param[in] msg Message to display.
*/
-#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) \
- ARM_COMPUTE_EXIT_ON_MSG(cond, msg)
+#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) ARM_COMPUTE_EXIT_ON_MSG(cond, msg)
/** If the condition is true, the given message is printed and an exception is thrown. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -425,8 +425,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) \
- ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__)
/** If the condition is true, the given message is printed and an exception is thrown.
*
@@ -439,11 +438,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, __VA_ARGS__); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, the given message is printed and an exception is thrown, otherwise value is returned
*
@@ -464,8 +463,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
*
* @param[in] cond Condition to evaluate.
*/
-#define ARM_COMPUTE_ERROR_ON(cond) \
- ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
+#define ARM_COMPUTE_ERROR_ON(cond) ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
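// Illustrative sketch (not part of the patch): ARM_COMPUTE_ERROR_ON stringifies its condition, so
// the report carries the failing expression itself; both macros are only active in builds with
// ARM_COMPUTE_ASSERTS_ENABLED defined. configure_example() is a hypothetical placeholder.
inline void configure_example(const ITensorInfo *input)
{
    ARM_COMPUTE_ERROR_ON(input == nullptr); // reported message is the text "input == nullptr"
    ARM_COMPUTE_ERROR_ON_MSG(input->num_dimensions() > 4, "Only tensors with up to 4 dimensions are supported");
}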
/** If the condition is true then an error message is printed and an exception thrown
*
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index f19e1e12e0..960201510a 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -96,7 +96,6 @@ public:
void reset(size_t dimension);
private:
-
/** Initialize a container iterator for the tensor with the specified number of dimensions, stride, buffer pointer and window.
*
* @param[in] num_dims The number of dimensions.
@@ -112,8 +111,7 @@ private:
class Dimension
{
public:
- constexpr Dimension()
- : _dim_start(0), _stride(0)
+ constexpr Dimension() : _dim_start(0), _stride(0)
{
}
@@ -133,7 +131,7 @@ private:
* @param[in,out] iterators Tensor iterators which will be updated by this function before calling lambda_function.
*/
template <typename L, typename... Ts>
-inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators);
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators);
/** Permutes given Dimensions according to a permutation vector
*
@@ -146,7 +144,7 @@ template <typename T>
inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
{
auto dimensions_copy = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
{
T dimension_val = (perm[i] < dimensions.num_dimensions()) ? dimensions_copy[perm[i]] : 0;
dimensions.set(i, dimension_val);
@@ -163,7 +161,7 @@ inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
inline void permute(TensorShape &shape, const PermutationVector &perm)
{
TensorShape shape_copy = shape;
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
{
size_t dimension_val = (perm[i] < shape.num_dimensions()) ? shape_copy[perm[i]] : 1;
shape.set(i, dimension_val, false, false); // Avoid changes in _num_dimension
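// Illustrative sketch (not part of the patch): permute() re-orders the dimensions in place, placing
// dimension perm[i] of the original shape at position i.
TensorShape shape(2U, 3U, 4U);
const PermutationVector perm(2U, 0U, 1U);
permute(shape, perm); // shape is now (4, 2, 3)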
@@ -180,8 +178,11 @@ inline void permute(TensorShape &shape, const PermutationVector &perm)
*
* @return The corresponding valid region
*/
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
- InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined);
+ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info,
+ const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy,
+ SamplingPolicy sampling_policy,
+ bool border_undefined);
/** Convert a linear index into n-dimensional coordinates.
*
@@ -224,7 +225,8 @@ const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map();
*
* @return The int conversion of the requested data layout index.
*/
-inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension);
+inline size_t get_data_layout_dimension_index(const DataLayout &data_layout,
+ const DataLayoutDimension &data_layout_dimension);
/** Get the DataLayoutDimension of a given index and layout.
*
@@ -245,10 +247,17 @@ inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &dat
*
* @return the number of output tiles along the x and y directions of size "output_tile_size"
*/
-inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info)
+inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims,
+ const Size2D &kernel_size,
+ const Size2D &output_tile_size,
+ const PadStrideInfo &conv_info)
{
- int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
- int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
+ int num_tiles_x =
+ std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) /
+ static_cast<float>(output_tile_size.width));
+ int num_tiles_y =
+ std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) /
+ static_cast<float>(output_tile_size.height));
// Clamp in case we provide paddings but we have 1D convolution
num_tiles_x = std::min(num_tiles_x, static_cast<int>(in_dims.width));
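// Illustrative sketch (not part of the patch): for a 14x14 input, a 3x3 kernel, a 4x4 output tile
// and the default (zero-padding) PadStrideInfo, each direction gives ceil((14 - 2 + 0 + 0) / 4.0) = 3,
// so a 3x3 grid of output tiles is expected.
const Size2D tiles = compute_winograd_convolution_tiles(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(4U, 4U), PadStrideInfo());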
@@ -277,7 +286,7 @@ inline T wrap_around(T x, T m)
*/
inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value)
{
- for(unsigned int i = 0; i < coords.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < coords.num_dimensions(); ++i)
{
coords[i] = wrap_around(coords[i], max_value);
}
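// Illustrative sketch (not part of the patch): wrap_around() maps negative values into [0, max_value),
// so for a 4D tensor an axis of -1 becomes 3.
Coordinates axes(-1, 1);
convert_negative_axis(axes, 4); // axes is now (3, 1)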
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index ff902bba20..60a21e9418 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -32,12 +32,9 @@ template <size_t dimension>
struct IncrementIterators
{
template <typename T, typename... Ts>
- static void unroll(T &&it, Ts &&... iterators)
+ static void unroll(T &&it, Ts &&...iterators)
{
- auto increment = [](T && it)
- {
- it.increment(dimension);
- };
+ auto increment = [](T &&it) { it.increment(dimension); };
utility::for_each(increment, std::forward<T>(it), std::forward<Ts>(iterators)...);
}
static void unroll()
@@ -50,14 +47,14 @@ template <size_t dim>
struct ForEachDimension
{
template <typename L, typename... Ts>
- static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+ static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators)
{
const auto &d = w[dim - 1];
- for(auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators < dim - 1 >::unroll(iterators...))
+ for (auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators<dim - 1>::unroll(iterators...))
{
id.set(dim - 1, v);
- ForEachDimension < dim - 1 >::unroll(w, id, lambda_function, iterators...);
+ ForEachDimension<dim - 1>::unroll(w, id, lambda_function, iterators...);
}
}
};
@@ -66,7 +63,7 @@ template <>
struct ForEachDimension<0>
{
template <typename L, typename... Ts>
- static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+ static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators)
{
ARM_COMPUTE_UNUSED(w, iterators...);
lambda_function(id);
@@ -74,31 +71,31 @@ struct ForEachDimension<0>
};
template <typename L, typename... Ts>
-inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators)
{
w.validate();
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(w[i].step() == 0);
}
Coordinates id;
- ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function), std::forward<Ts>(iterators)...);
+ ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function),
+ std::forward<Ts>(iterators)...);
}
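// Illustrative usage sketch (not part of the patch): visit every element selected by a Window and
// write to it through an Iterator. "tensor" is a hypothetical ITensor pointer holding F32 data.
Window win;
win.use_tensor_dimensions(tensor->info()->tensor_shape());
Iterator it(tensor, win);
execute_window_loop(
    win, [&](const Coordinates &) { *reinterpret_cast<float *>(it.ptr()) = 1.0f; }, it);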
-inline constexpr Iterator::Iterator()
- : _ptr(nullptr), _dims()
+inline constexpr Iterator::Iterator() : _ptr(nullptr), _dims()
{
}
-inline Iterator::Iterator(const ITensor *tensor, const Window &win)
- : Iterator()
+inline Iterator::Iterator(const ITensor *tensor, const Window &win) : Iterator()
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->info() == nullptr);
- initialize(tensor->info()->num_dimensions(), tensor->info()->strides_in_bytes(), tensor->buffer(), tensor->info()->offset_first_element_in_bytes(), win);
+ initialize(tensor->info()->num_dimensions(), tensor->info()->strides_in_bytes(), tensor->buffer(),
+ tensor->info()->offset_first_element_in_bytes(), win);
}
inline Iterator::Iterator(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
@@ -107,21 +104,22 @@ inline Iterator::Iterator(size_t num_dims, const Strides &strides, uint8_t *buff
initialize(num_dims, strides, buffer, offset, win);
}
-inline void Iterator::initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
+inline void
+Iterator::initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
{
ARM_COMPUTE_ERROR_ON(buffer == nullptr);
_ptr = buffer + offset;
//Initialize the stride for each dimension and calculate the position of the first element of the iteration:
- for(unsigned int n = 0; n < num_dims; ++n)
+ for (unsigned int n = 0; n < num_dims; ++n)
{
_dims[n]._stride = win[n].step() * strides[n];
std::get<0>(_dims)._dim_start += static_cast<size_t>(strides[n]) * win[n].start();
}
//Copy the starting point to all the dimensions:
- for(unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
+ for (unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
{
_dims[n]._dim_start = std::get<0>(_dims)._dim_start;
}
@@ -135,7 +133,7 @@ inline void Iterator::increment(const size_t dimension)
_dims[dimension]._dim_start += _dims[dimension]._stride;
- for(unsigned int n = 0; n < dimension; ++n)
+ for (unsigned int n = 0; n < dimension; ++n)
{
_dims[n]._dim_start = _dims[dimension]._dim_start;
}
@@ -157,7 +155,7 @@ inline void Iterator::reset(const size_t dimension)
_dims[dimension]._dim_start = _dims[dimension + 1]._dim_start;
- for(unsigned int n = 0; n < dimension; ++n)
+ for (unsigned int n = 0; n < dimension; ++n)
{
_dims[n]._dim_start = _dims[dimension]._dim_start;
}
@@ -170,9 +168,9 @@ inline Coordinates index2coords(const TensorShape &shape, int index)
ARM_COMPUTE_ERROR_ON_MSG(index < 0 || index >= num_elements, "Index has to be in [0, num_elements)!");
ARM_COMPUTE_ERROR_ON_MSG(num_elements == 0, "Cannot create coordinate from empty shape!");
- Coordinates coord{ 0 };
+ Coordinates coord{0};
- for(int d = shape.num_dimensions() - 1; d >= 0; --d)
+ for (int d = shape.num_dimensions() - 1; d >= 0; --d)
{
num_elements /= shape[d];
coord.set(d, index / num_elements);
@@ -191,7 +189,7 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord)
int index = 0;
int stride = 1;
- for(unsigned int d = 0; d < coord.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < coord.num_dimensions(); ++d)
{
index += coord[d] * stride;
stride *= shape[d];
@@ -200,9 +198,11 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord)
return index;
}
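// Illustrative sketch (not part of the patch): the two helpers are inverses of each other. For the
// 2D shape (4, 3), the coordinate (3, 1) maps to linear index 3 * 1 + 1 * 4 = 7 and back again.
const TensorShape shape(4U, 3U);
const int idx = coords2index(shape, Coordinates(3, 1)); // 7
const Coordinates coord = index2coords(shape, 7);       // (3, 1)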
-inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension)
+inline size_t get_data_layout_dimension_index(const DataLayout &data_layout,
+ const DataLayoutDimension &data_layout_dimension)
{
- ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+ ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+ "Cannot retrieve the dimension index for an unknown layout!");
const auto &dims = get_layout_map().at(data_layout);
const auto &it = std::find(dims.cbegin(), dims.cend(), data_layout_dimension);
ARM_COMPUTE_ERROR_ON_MSG(it == dims.cend(), "Invalid dimension for the given layout.");
@@ -211,7 +211,8 @@ inline size_t get_data_layout_dimension_index(const DataLayout &data_layout, con
inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &data_layout, const size_t index)
{
- ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the layout dimension for an unknown layout!");
+ ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+ "Cannot retrieve the layout dimension for an unknown layout!");
const auto &dims = get_layout_map().at(data_layout);
ARM_COMPUTE_ERROR_ON_MSG(index >= dims.size(), "Invalid index for the given layout.");
return dims[index];
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
index 880f6d6b27..9c9fb90915 100644
--- a/arm_compute/core/IAccessWindow.h
+++ b/arm_compute/core/IAccessWindow.h
@@ -100,7 +100,10 @@ public:
* @return a valid region.
*
*/
- virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0;
+ virtual ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const = 0;
};
/** Implementation of a rectangular access pattern. */
@@ -161,7 +164,10 @@ public:
* @param[in] border_undefined (Optional) Undefined borders are excluded from the valid region.
* @param[in] border_size (Optional) Size of the border around the XY-plane of the tensor.
*/
- void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0));
+ void set_valid_region(const Window &window,
+ const ValidRegion &input_valid_region,
+ bool border_undefined = false,
+ const BorderSize &border_size = BorderSize(0));
/** Compute the valid region based on access pattern, valid region of the inputs and border mode.
*
@@ -189,7 +195,10 @@ public:
* @return a valid region.
*
*/
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
bool update_window_if_needed(Window &window) const override;
bool update_padding_if_needed(const Window &window) override;
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
index 6edbc1d5d5..3471fc9a86 100644
--- a/arm_compute/core/IArray.h
+++ b/arm_compute/core/IArray.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_IARRAY_H
#include "arm_compute/core/Error.h"
+
#include <cstddef>
#include <cstdint>
@@ -36,14 +37,12 @@ class IArray
{
public:
/** Default constructor */
- IArray()
- : _num_values(0), _max_size(0) {};
+ IArray() : _num_values(0), _max_size(0){};
/** Constructor: initializes an array which can contain up to max_num_values values
*
* @param[in] max_num_values Maximum number of values the array will be able to store
*/
- IArray(size_t max_num_values)
- : _num_values(0), _max_size(max_num_values)
+ IArray(size_t max_num_values) : _num_values(0), _max_size(max_num_values)
{
}
/** Maximum number of values which can be stored in this array
@@ -73,7 +72,7 @@ public:
bool push_back(const T &val)
{
ARM_COMPUTE_ERROR_ON(0 == _max_size);
- if(_num_values >= max_num_values())
+ if (_num_values >= max_num_values())
{
_num_values = max_num_values() + 1;
return false;
@@ -142,5 +141,5 @@ using IInt16Array = IArray<int16_t>;
using IInt32Array = IArray<int32_t>;
/** Interface for Array of floats. */
using IFloatArray = IArray<float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_IARRAY_H */
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
index 98fd18cc91..403a2c724e 100644
--- a/arm_compute/core/IKernel.h
+++ b/arm_compute/core/IKernel.h
@@ -73,5 +73,5 @@ protected:
private:
Window _window;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IKERNEL_H */
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index 32b93576bd..aad8313261 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -94,9 +94,9 @@ public:
void mark_as_used() const;
private:
- mutable bool _is_used = { true }; /**< Flag that marks if the tensor is used or not */
+ mutable bool _is_used = {true}; /**< Flag that marks if the tensor is used or not */
};
using IImage = ITensor;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ITENSOR_H */
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index e7c0b182c6..c42f4b57a1 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Utility.h"
+
#include "support/ICloneable.h"
#include <cstddef>
@@ -328,23 +329,23 @@ public:
* not broadcast compatible.
*/
template <typename... Infos>
- static std::pair<TensorShape, ValidRegion> broadcast_shape_and_valid_region(const Infos &... infos)
+ static std::pair<TensorShape, ValidRegion> broadcast_shape_and_valid_region(const Infos &...infos)
{
TensorShape bc_shape = TensorShape::broadcast_shape(infos.tensor_shape()...);
- ValidRegion bc_valid_region{ Coordinates(), bc_shape };
+ ValidRegion bc_valid_region{Coordinates(), bc_shape};
- auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo & info)
+ auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo &info)
{
- if(info.num_dimensions() != 0)
+ if (info.num_dimensions() != 0)
{
- for(size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d)
+ for (size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d)
{
const bool is_broadcast = (info.tensor_shape()[d] == 1);
const int anchor_max = std::max(bc_valid_region.anchor[d], info.valid_region().anchor[d]);
const size_t valid_min = std::min(bc_valid_region.shape[d], info.valid_region().shape[d]);
- if(!is_broadcast || (valid_min == 0))
+ if (!is_broadcast || (valid_min == 0))
{
bc_valid_region.anchor.set(d, anchor_max);
bc_valid_region.shape.set(d, valid_min);
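// Illustrative sketch (not part of the patch): broadcasting a 1x3 tensor against a 4x3 one; the
// shapes must be broadcast compatible (each dimension equal or 1).
TensorInfo a(TensorShape(1U, 3U), 1, DataType::F32);
TensorInfo b(TensorShape(4U, 3U), 1, DataType::F32);
const auto broadcast = ITensorInfo::broadcast_shape_and_valid_region(a, b); // broadcast.first is (4, 3)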
diff --git a/arm_compute/core/ITensorPack.h b/arm_compute/core/ITensorPack.h
index 17b7241862..f456c50769 100644
--- a/arm_compute/core/ITensorPack.h
+++ b/arm_compute/core/ITensorPack.h
@@ -42,18 +42,16 @@ public:
struct PackElement
{
PackElement() = default;
- PackElement(int id, ITensor *tensor)
- : id(id), tensor(tensor), ctensor(nullptr)
+ PackElement(int id, ITensor *tensor) : id(id), tensor(tensor), ctensor(nullptr)
{
}
- PackElement(int id, const ITensor *ctensor)
- : id(id), tensor(nullptr), ctensor(ctensor)
+ PackElement(int id, const ITensor *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
{
}
- int id{ -1 };
- ITensor *tensor{ nullptr };
- const ITensor *ctensor{ nullptr };
+ int id{-1};
+ ITensor *tensor{nullptr};
+ const ITensor *ctensor{nullptr};
};
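// Illustrative usage sketch (not part of the patch): operators receive their tensors through an
// ITensorPack keyed by integer ids. TensorType::ACL_SRC / ACL_DST are ids defined elsewhere in the
// library; "src" and "dst" are hypothetical ITensor pointers.
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC, src);
pack.add_tensor(TensorType::ACL_DST, dst);
const ITensor *packed_src = pack.get_const_tensor(TensorType::ACL_SRC);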
public:
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index 2bf5dee18c..168a06a55c 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -33,24 +33,24 @@ namespace arm_compute
/** Descriptor for FFT scale kernels */
struct FFTScaleKernelInfo
{
- float scale{ 0.f }; /**< Axis to perform the kernel on. */
- bool conjugate{ true }; /**< Flag to conjugate the output/ */
+ float scale{0.f}; /**< Scale to apply to the output. */
+ bool conjugate{true}; /**< Flag to conjugate the output. */
};
/** Descriptor for FFT digit reverse kernels */
struct FFTDigitReverseKernelInfo
{
- unsigned int axis{ 0 }; /**< Axis to perform the kernel on. */
- bool conjugate{ false }; /**< Flag to conjugate the output/ */
+ unsigned int axis{0}; /**< Axis to perform the kernel on. */
+ bool conjugate{false}; /**< Flag to conjugate the output. */
};
/** Descriptor used by the FFT core kernels */
struct FFTRadixStageKernelInfo
{
- unsigned int axis{ 0 }; /**< Axis to run the kernel on. */
- unsigned int radix{ 0 }; /**< Radix to use. */
- unsigned int Nx{ 0 }; /**< Nx coefficient. */
- bool is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */
+ unsigned int axis{0}; /**< Axis to run the kernel on. */
+ unsigned int radix{0}; /**< Radix to use. */
+ unsigned int Nx{0}; /**< Nx coefficient. */
+ bool is_first_stage{false}; /**< Flags if the FFT kernel is the first stage of a decomposed FFT. */
};
class ITensorInfo;
@@ -58,89 +58,102 @@ class ITensorInfo;
struct GEMMKernelInfo
{
GEMMKernelInfo() = default;
- GEMMKernelInfo(
- unsigned int im,
- unsigned int in,
- unsigned int ik,
- unsigned int idepth_output_gemm3d,
- bool ireinterpret_input_as_3d,
- bool ibroadcast_bias,
- bool ifp_mixed_precision,
- bool ihas_pad_y,
- ActivationLayerInfo iactivation_info,
- int inmult_transpose1xW_width,
- int imult_interleave4x4_height,
- GEMMLHSMatrixInfo ilhs_info,
- GEMMRHSMatrixInfo irhs_info,
- int32_t ina_offset,
- int32_t inb_offset)
- : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
- has_pad_y(ihas_pad_y), activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info),
- rhs_info(irhs_info), a_offset(ina_offset), b_offset(inb_offset)
+ GEMMKernelInfo(unsigned int im,
+ unsigned int in,
+ unsigned int ik,
+ unsigned int idepth_output_gemm3d,
+ bool ireinterpret_input_as_3d,
+ bool ibroadcast_bias,
+ bool ifp_mixed_precision,
+ bool ihas_pad_y,
+ ActivationLayerInfo iactivation_info,
+ int inmult_transpose1xW_width,
+ int imult_interleave4x4_height,
+ GEMMLHSMatrixInfo ilhs_info,
+ GEMMRHSMatrixInfo irhs_info,
+ int32_t ina_offset,
+ int32_t inb_offset)
+ : m(im),
+ n(in),
+ k(ik),
+ depth_output_gemm3d(idepth_output_gemm3d),
+ reinterpret_input_as_3d(ireinterpret_input_as_3d),
+ broadcast_bias(ibroadcast_bias),
+ fp_mixed_precision(ifp_mixed_precision),
+ has_pad_y(ihas_pad_y),
+ activation_info(iactivation_info),
+ mult_transpose1xW_width(inmult_transpose1xW_width),
+ mult_interleave4x4_height(imult_interleave4x4_height),
+ lhs_info(ilhs_info),
+ rhs_info(irhs_info),
+ a_offset(ina_offset),
+ b_offset(inb_offset)
{
}
- unsigned int m{ 0 }; /**< Number of LHS rows*/
- unsigned int n{ 0 }; /**< Number of RHS columns*/
- unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */
- unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */
- bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
- bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */
- bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
- bool has_pad_y{ false }; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
- ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
- int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */
- int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */
- GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
- GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
- int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */
- int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */
- GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
+ unsigned int m{0}; /**< Number of LHS rows*/
+ unsigned int n{0}; /**< Number of RHS columns*/
+ unsigned int k{0}; /**< Number of LHS columns or RHS rows */
+ unsigned int depth_output_gemm3d{0}; /**< Depth of the output tensor in case it is reinterpreted as 3D */
+ bool reinterpret_input_as_3d{false}; /**< Flag used to reinterpret the input as 3D */
+ bool broadcast_bias{false}; /**< Flag used to broadcast the bias addition */
+ bool fp_mixed_precision{false}; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+ bool has_pad_y{
+ false}; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+ ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
+ int mult_transpose1xW_width{1}; /**< Multiplication factor for the width of the 1xW transposed block */
+ int mult_interleave4x4_height{1}; /**< Multiplication factor for the height of the 4x4 interleaved block */
+ GEMMLHSMatrixInfo
+ lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+ GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
+ int32_t a_offset{0}; /**< Offset to be added to each element of the matrix A */
+ int32_t b_offset{0}; /**< Offset to be added to each element of the matrix B */
+ GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
};
/** Compute descriptor used by the depthwise convolution native kernel */
struct DWCComputeKernelInfo
{
- unsigned int n0{ 1 }; /**< Number of columns processed by each thread */
- unsigned int m0{ 1 }; /**< Number of rows processed by each thread */
- bool export_input_to_cl_image{ false }; /**< Export input to cl_image */
- bool export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */
+ unsigned int n0{1}; /**< Number of columns processed by each thread */
+ unsigned int m0{1}; /**< Number of rows processed by each thread */
+ bool export_input_to_cl_image{false}; /**< Export input to cl_image */
+ bool export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
};
/** Compute descriptor used by the direct convolution kernel */
struct DirectConvComputeKernelInfo
{
- int32_t m0{ 1 }; /**< Number of rows to be processed by the kernel */
- int32_t n0{ 1 }; /**< Number of columns to be processed by the kernel */
- int32_t k0{ 1 }; /**< Number of partial accumulations to be processed in a single iteration by the kernel */
- bool export_weights_to_cl_image{ false }; /**< Flag to export the weights to cl_image */
- bool export_output_to_cl_image{ false }; /**< Flag to export the output to cl_image */
- bool export_input_to_cl_image{ false }; /**< Flag to export the input to cl_image */
+ int32_t m0{1}; /**< Number of rows to be processed by the kernel */
+ int32_t n0{1}; /**< Number of columns to be processed by the kernel */
+ int32_t k0{1}; /**< Number of partial accumulations to be processed in a single iteration by the kernel */
+ bool export_weights_to_cl_image{false}; /**< Flag to export the weights to cl_image */
+ bool export_output_to_cl_image{false}; /**< Flag to export the output to cl_image */
+ bool export_input_to_cl_image{false}; /**< Flag to export the input to cl_image */
};
/** Descriptor used by the softmax kernels */
struct SoftmaxKernelInfo
{
- float beta{ 1.f }; /**< A scaling factor for the exponent with default value 1.0 */
- bool is_log{ false }; /**< Flag used to perform Log Softmax operation */
- DataType input_data_type{ DataType::UNKNOWN }; /**< Input tensor data type */
- int32_t axis{ 0 }; /**< The dimension in which to apply softmax. */
+ float beta{1.f}; /**< A scaling factor for the exponent with default value 1.0 */
+ bool is_log{false}; /**< Flag used to perform Log Softmax operation */
+ DataType input_data_type{DataType::UNKNOWN}; /**< Input tensor data type */
+ int32_t axis{0}; /**< The dimension in which to apply softmax. */
};
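// Illustrative sketch (not part of the patch): the descriptor is a plain aggregate filled in by the
// caller before configuring a softmax kernel.
SoftmaxKernelInfo softmax_info{};
softmax_info.beta            = 1.0f;
softmax_info.is_log          = true; // LogSoftmax
softmax_info.input_data_type = DataType::F16;
softmax_info.axis            = 0;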
/** Descriptor used by the direct convolution layer output stage kernels */
struct DirectConvolutionLayerOutputStageKernelInfo
{
- int32_t result_fixedpoint_multiplier{ 0 }; /**< Result output stage multiplier used for quantizing */
- int32_t result_shift{ 0 }; /**< Result output stage shift used for quantizing */
- int32_t result_offset_after_shift{ 0 }; /**< Result offset used for quantizing */
- DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */
+ int32_t result_fixedpoint_multiplier{0}; /**< Result output stage multiplier used for quantizing */
+ int32_t result_shift{0}; /**< Result output stage shift used for quantizing */
+ int32_t result_offset_after_shift{0}; /**< Result offset used for quantizing */
+ DataType output_data_type{
+ DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */
};
struct InstanceNormalizationLayerKernelInfo
{
/** Default constructor */
- InstanceNormalizationLayerKernelInfo()
- : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true)
+ InstanceNormalizationLayerKernelInfo() : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true)
{
}
/** Constructor
@@ -177,10 +190,10 @@ struct GEMMLowpReductionKernelInfo
{
}
- int32_t k{ 0 }; /**< Number of matrix columns/rows */
- bool is_reshaped{ false }; /**< True if the input tensor has been reshaped */
- int32_t scalar{ 0 }; /**< Scalar value to multiply each reduced column/row by */
- bool mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */
+ int32_t k{0}; /**< Number of matrix columns/rows */
+ bool is_reshaped{false}; /**< True if the input tensor has been reshaped */
+ int32_t scalar{0}; /**< Scalar value to multiply each reduced column/row by */
+ bool mul_by_scalar{false}; /**< True if each column/row reduction has to be multiplied by a scalar value */
};
struct ScaleKernelInfo
@@ -202,13 +215,13 @@ struct ScaleKernelInfo
bool use_padding = true,
bool align_corners = false,
DataLayout data_layout = DataLayout::UNKNOWN) noexcept
- : interpolation_policy{ interpolation_policy },
- border_mode{ border_mode },
- constant_border_value{ constant_border_value },
- sampling_policy{ sampling_policy },
- use_padding{ use_padding },
- align_corners{ align_corners },
- data_layout{ data_layout }
+ : interpolation_policy{interpolation_policy},
+ border_mode{border_mode},
+ constant_border_value{constant_border_value},
+ sampling_policy{sampling_policy},
+ use_padding{use_padding},
+ align_corners{align_corners},
+ data_layout{data_layout}
{
}
@@ -224,16 +237,17 @@ struct ScaleKernelInfo
struct MatMulKernelInfo
{
MatMulKernelInfo() = default;
- MatMulKernelInfo(bool adj_lhs, bool adj_rhs, int m0 = 1, int n0 = 1, int k0 = 1, bool export_rhs_to_cl_image = false)
- : adj_lhs{ adj_lhs }, adj_rhs{ adj_rhs }, m0{ m0 }, n0{ n0 }, k0{ k0 }, export_rhs_to_cl_image{ export_rhs_to_cl_image }
+ MatMulKernelInfo(
+ bool adj_lhs, bool adj_rhs, int m0 = 1, int n0 = 1, int k0 = 1, bool export_rhs_to_cl_image = false)
+ : adj_lhs{adj_lhs}, adj_rhs{adj_rhs}, m0{m0}, n0{n0}, k0{k0}, export_rhs_to_cl_image{export_rhs_to_cl_image}
{
}
- bool adj_lhs{ false }; /**< Get Adjoint LHS flag value */
- bool adj_rhs{ false }; /**< Get Adjoint RHS flag value */
- int m0{ 1 }; /**< Number of output rows processed by each work-item*/
- int n0{ 1 }; /**< Number of output columns processed by each work-item*/
- int k0{ 1 }; /**< Number of inner accumulations */
- bool export_rhs_to_cl_image{ false }; /**< Flag to know whether the RHS tensor should be exported to cl_image*/
+ bool adj_lhs{false}; /**< Get Adjoint LHS flag value */
+ bool adj_rhs{false}; /**< Get Adjoint RHS flag value */
+ int m0{1}; /**< Number of output rows processed by each work-item*/
+ int n0{1}; /**< Number of output columns processed by each work-item*/
+ int k0{1}; /**< Number of inner accumulations */
+ bool export_rhs_to_cl_image{false}; /**< Flag to know whether the RHS tensor should be exported to cl_image*/
};
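// Illustrative sketch (not part of the patch): a block configuration where each work-item computes
// a 4x4 output block with 8 inner accumulations and neither operand is adjoint.
const MatMulKernelInfo matmul_info(/* adj_lhs */ false, /* adj_rhs */ false, /* m0 */ 4, /* n0 */ 4, /* k0 */ 8);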
} // namespace arm_compute
#endif // ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h
index bc0ecb802e..03b861f765 100644
--- a/arm_compute/core/Log.h
+++ b/arm_compute/core/Log.h
@@ -34,11 +34,11 @@
#define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER() \
do \
{ \
- if(arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \
+ if (arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \
{ \
arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
} \
- } while(false)
+ } while (false)
#else /* ARM_COMPUTE_LOGGING_ENABLED */
#define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER()
#endif /* ARM_COMPUTE_LOGGING_ENABLED */
@@ -53,7 +53,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG("CORE", log_level, msg); \
- } while(false)
+ } while (false)
/** Log a message with format to the core system logger
*
@@ -66,7 +66,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", log_level, fmt, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Log a stream to the core system logger
*
@@ -78,7 +78,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_STREAM("CORE", log_level, ss); \
- } while(false)
+ } while (false)
/** Log information level message to the core system logger
*
@@ -89,7 +89,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG_CORE(arm_compute::logging::LogLevel::INFO, msg); \
- } while(false)
+ } while (false)
/** Log information level formatted message to the core system logger
*
@@ -101,7 +101,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(arm_compute::logging::LogLevel::INFO, #fmt, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Log information level stream to the core system logger
*
@@ -112,6 +112,6 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_STREAM_CORE(arm_compute::logging::LogLevel::INFO, ss); \
- } while(false)
+ } while (false)
#endif /* ARM_COMPUTE_LOGGING_MACROS_H */
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 790f58a793..0b4df4f2e2 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -24,8 +24,8 @@
#ifndef ARM_COMPUTE_PIXELVALUE_H
#define ARM_COMPUTE_PIXELVALUE_H
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/Types.h"
#include <cstdint>
@@ -36,11 +36,7 @@ class PixelValue
{
public:
/** Default constructor: value initialized to 0 */
- PixelValue() noexcept
- : value
- {
- int64_t(0)
- }
+ PixelValue() noexcept : value{int64_t(0)}
{
}
/** Initialize the union with a pixel value of chosen datatype
@@ -49,10 +45,9 @@ public:
* @param[in] datatype DataType in which @p v has to be stored
* @param[in] qinfo (Optional) QuantizationInfo to apply in case of quantized data types to @p v
*/
- PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo())
- : PixelValue()
+ PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo()) : PixelValue()
{
- switch(datatype)
+ switch (datatype)
{
case DataType::U8:
value.u8 = static_cast<uint8_t>(v);
@@ -112,8 +107,7 @@ public:
*
* @param[in] v S8 value.
*/
- PixelValue(int8_t v)
- : PixelValue()
+ PixelValue(int8_t v) : PixelValue()
{
value.s8 = v;
}
@@ -121,8 +115,7 @@ public:
*
* @param[in] v U8 value.
*/
- PixelValue(uint8_t v)
- : PixelValue()
+ PixelValue(uint8_t v) : PixelValue()
{
value.u8 = v;
}
@@ -130,8 +123,7 @@ public:
*
* @param[in] v U16 value.
*/
- PixelValue(uint16_t v)
- : PixelValue()
+ PixelValue(uint16_t v) : PixelValue()
{
value.u16 = v;
}
@@ -139,8 +131,7 @@ public:
*
* @param[in] v S16 value.
*/
- PixelValue(int16_t v)
- : PixelValue()
+ PixelValue(int16_t v) : PixelValue()
{
value.s16 = v;
}
@@ -148,8 +139,7 @@ public:
*
* @param[in] v U32 value.
*/
- PixelValue(uint32_t v)
- : PixelValue()
+ PixelValue(uint32_t v) : PixelValue()
{
value.u32 = v;
}
@@ -157,8 +147,7 @@ public:
*
* @param[in] v S32 value.
*/
- PixelValue(int32_t v)
- : PixelValue()
+ PixelValue(int32_t v) : PixelValue()
{
value.s32 = v;
}
@@ -167,8 +156,7 @@ public:
*
* @param[in] v U64 value.
*/
- PixelValue(uint64_t v)
- : PixelValue()
+ PixelValue(uint64_t v) : PixelValue()
{
value.u64 = v;
}
@@ -176,8 +164,7 @@ public:
*
* @param[in] v S64 value.
*/
- PixelValue(int64_t v)
- : PixelValue()
+ PixelValue(int64_t v) : PixelValue()
{
value.s64 = v;
}
@@ -185,8 +172,7 @@ public:
*
* @param[in] v F16 value.
*/
- PixelValue(bfloat16 v)
- : PixelValue()
+ PixelValue(bfloat16 v) : PixelValue()
{
value.bf16 = v;
}
@@ -194,8 +180,7 @@ public:
*
* @param[in] v F16 value.
*/
- PixelValue(half v)
- : PixelValue()
+ PixelValue(half v) : PixelValue()
{
value.f16 = v;
}
@@ -203,8 +188,7 @@ public:
*
* @param[in] v F32 value.
*/
- PixelValue(float v)
- : PixelValue()
+ PixelValue(float v) : PixelValue()
{
value.f32 = v;
}
@@ -212,8 +196,7 @@ public:
*
* @param[in] v F64 value.
*/
- PixelValue(double v)
- : PixelValue()
+ PixelValue(double v) : PixelValue()
{
value.f64 = v;
}
@@ -221,23 +204,23 @@ public:
* Use the field corresponding to the image format
*/
union
- {
- uint64_t u64; /**< Single channel U64 */
- int64_t s64; /**< Single channel S64 */
- uint8_t rgb[3]; /**< 3 channels: RGB888 */
- uint8_t yuv[3]; /**< 3 channels: Any YUV format */
- uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
- double f64; /**< Single channel double */
- float f32; /**< Single channel float 32 */
- half f16; /**< Single channel F16 */
- bfloat16 bf16; /**< Single channel brain floating-point number */
- uint8_t u8; /**< Single channel U8 */
- int8_t s8; /**< Single channel S8 */
- uint16_t u16; /**< Single channel U16 */
- int16_t s16; /**< Single channel S16 */
- uint32_t u32; /**< Single channel U32 */
- int32_t s32; /**< Single channel S32 */
- } value;
+ {
+ uint64_t u64; /**< Single channel U64 */
+ int64_t s64; /**< Single channel S64 */
+ uint8_t rgb[3]; /**< 3 channels: RGB888 */
+ uint8_t yuv[3]; /**< 3 channels: Any YUV format */
+ uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
+ double f64; /**< Single channel double */
+ float f32; /**< Single channel float 32 */
+ half f16; /**< Single channel F16 */
+ bfloat16 bf16; /**< Single channel brain floating-point number */
+ uint8_t u8; /**< Single channel U8 */
+ int8_t s8; /**< Single channel S8 */
+ uint16_t u16; /**< Single channel U16 */
+ int16_t s16; /**< Single channel S16 */
+ uint32_t u32; /**< Single channel U32 */
+ int32_t s32; /**< Single channel S32 */
+ } value;
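// Illustrative sketch (not part of the patch): the typed constructors above pick the matching union
// field, while the (value, DataType) constructor converts through the switch shown earlier.
PixelValue zero(0.f);                   // stores value.f32 == 0.f
PixelValue border(255.0, DataType::U8); // stores value.u8  == 255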
/** Interpret the pixel value as a U8
*
* @param[out] v Returned value
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 8fa513eee1..471b8c57ab 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/utils/misc/Utility.h"
+
#include "support/ToolchainSupport.h"
#include <vector>
@@ -41,8 +42,7 @@ using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value
struct UniformQuantizationInfo
{
/** Default constructor */
- UniformQuantizationInfo()
- : scale(0.f), offset(0)
+ UniformQuantizationInfo() : scale(0.f), offset(0)
{
}
/** Constructor
@@ -50,8 +50,7 @@ struct UniformQuantizationInfo
* @param[in] scale Quantization scale
* @param[in] offset Quantization offset
*/
- UniformQuantizationInfo(float scale, int32_t offset)
- : scale(scale), offset(offset)
+ UniformQuantizationInfo(float scale, int32_t offset) : scale(scale), offset(offset)
{
}
/** Checks if the scale and offset are both zero */
@@ -69,9 +68,7 @@ class QuantizationInfo
{
public:
/** Default constructor */
- QuantizationInfo() noexcept
- : _scale(),
- _offset()
+ QuantizationInfo() noexcept : _scale(), _offset()
{
}
/** Construct quantization info.
@@ -80,8 +77,7 @@ public:
*
* @param[in] scale Scale.
*/
- QuantizationInfo(float scale)
- : _scale(1, scale), _offset()
+ QuantizationInfo(float scale) : _scale(1, scale), _offset()
{
}
/** Construct quantization info.
@@ -91,8 +87,7 @@ public:
* @param[in] scale Scale.
* @param[in] offset Offset.
*/
- QuantizationInfo(float scale, int offset)
- : _scale(1, scale), _offset(1, offset)
+ QuantizationInfo(float scale, int offset) : _scale(1, scale), _offset(1, offset)
{
}
/** Construct quantization info.
@@ -101,8 +96,7 @@ public:
*
* @param[in] scale Scale.
*/
- QuantizationInfo(std::vector<float> scale)
- : _scale(scale), _offset()
+ QuantizationInfo(std::vector<float> scale) : _scale(scale), _offset()
{
}
/** Construct quantization info.
@@ -112,8 +106,7 @@ public:
* @param[in] scale Scale.
* @param[in] offset Offset.
*/
- QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset)
- : _scale(scale), _offset(offset)
+ QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset) : _scale(scale), _offset(offset)
{
}
/** Scale vector accessor
@@ -208,8 +201,7 @@ inline bool operator!=(const UniformQuantizationInfo &lhs, const UniformQuantiza
template <typename QUANTIZED_TYPE = uint8_t>
struct Qasymm8QuantizationHelper
{
- static_assert(std::is_same<QUANTIZED_TYPE, uint8_t>::value
- || std::is_same<QUANTIZED_TYPE, int8_t>::value,
+ static_assert(std::is_same<QUANTIZED_TYPE, uint8_t>::value || std::is_same<QUANTIZED_TYPE, int8_t>::value,
"quantized type should be either uint8_t or int8_t.");
/** Quantize a value given a 8-bit asymmetric quantization scheme
@@ -234,9 +226,10 @@ struct Qasymm8QuantizationHelper
*
* @return Quantized value
*/
- static inline QUANTIZED_TYPE quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy)
+ static inline QUANTIZED_TYPE
+ quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy)
{
- if(rounding_policy == RoundingPolicy::TO_NEAREST_UP)
+ if (rounding_policy == RoundingPolicy::TO_NEAREST_UP)
{
return quantize(value, qinfo);
}
@@ -254,7 +247,8 @@ struct Qasymm8QuantizationHelper
*
* @return Quantized value
*/
- static inline QUANTIZED_TYPE quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+ static inline QUANTIZED_TYPE
+ quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
const UniformQuantizationInfo uqinfo = qinfo.uniform();
ARM_COMPUTE_ERROR_ON(uqinfo.scale == 0);
@@ -297,7 +291,8 @@ struct Qasymm8QuantizationHelper
* @return Quantized value
*/
template <typename INFO_TYPE>
-inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline uint8_t
+quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
return Qasymm8QuantizationHelper<uint8_t>::quantize(value, qinfo, rounding_policy);
}
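// Illustrative sketch (not part of the patch): with scale 0.1 and offset 10, the real value 1.5f
// quantizes to round(1.5 / 0.1) + 10 = 25 under the default TO_NEAREST_UP rounding policy.
const uint8_t q = quantize_qasymm8(1.5f, QuantizationInfo(0.1f, 10)); // 25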
@@ -311,7 +306,9 @@ inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPol
* @return Quantized value
*/
template <typename INFO_TYPE>
-inline int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline int8_t quantize_qasymm8_signed(float value,
+ const INFO_TYPE &qinfo,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
return Qasymm8QuantizationHelper<int8_t>::quantize(value, qinfo, rounding_policy);
}
@@ -441,7 +438,9 @@ inline float dequantize(uint16_t value, float scale, int32_t offset)
*
* @return Quantized value
*/
-inline int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline int16_t quantize_qsymm16(float value,
+ const UniformQuantizationInfo &qinfo,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
int quantized = arm_compute::round(value / qinfo.scale, rounding_policy);
quantized = arm_compute::utility::clamp<int, int16_t>(quantized);
@@ -492,7 +491,9 @@ inline float dequantize_qsymm16(int16_t value, const QuantizationInfo &qinfo)
*
* @return Quantized value
*/
-inline uint16_t quantize_qasymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline uint16_t quantize_qasymm16(float value,
+ const UniformQuantizationInfo &qinfo,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
int quantized = arm_compute::round(value / qinfo.scale, rounding_policy) + qinfo.offset;
quantized = arm_compute::utility::clamp<int, uint16_t>(quantized);
@@ -565,7 +566,8 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo)
* z_n = - z_i * s_i / s_o + z_o
*
*/
-inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in, const UniformQuantizationInfo &uqinfo_out)
+inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in,
+ const UniformQuantizationInfo &uqinfo_out)
{
float scale_to_apply = uqinfo_out.scale;
int32_t offset_to_apply = uqinfo_out.offset;
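// Illustrative sketch (not part of the patch), applying the offset formula quoted above: requantizing
// from (scale 0.5, offset 10) to (scale 0.25, offset 3) yields z_n = -10 * 0.5 / 0.25 + 3 = -17; the
// returned info pairs that offset with the corresponding scale ratio.
const UniformQuantizationInfo requant =
    compute_requantization_scale_offset(UniformQuantizationInfo(0.5f, 10), UniformQuantizationInfo(0.25f, 3));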
diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h
index b6817b5107..30a5a0fe9d 100644
--- a/arm_compute/core/Rounding.h
+++ b/arm_compute/core/Rounding.h
@@ -42,5 +42,5 @@ enum class RoundingPolicy
* @return Rounded value of the argument x.
*/
int round(float x, RoundingPolicy rounding_policy);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ROUNDING_H */
diff --git a/arm_compute/core/Size2D.h b/arm_compute/core/Size2D.h
index f3e9bea4c7..672b392050 100644
--- a/arm_compute/core/Size2D.h
+++ b/arm_compute/core/Size2D.h
@@ -41,9 +41,7 @@ public:
* @param[in] w Width of the image or rectangle
* @param[in] h Height of the image or rectangle
*/
- Size2D(size_t w, size_t h) noexcept
- : width(w),
- height(h)
+ Size2D(size_t w, size_t h) noexcept : width(w), height(h)
{
}
/** The area of the image or rectangle calculated as (width * height)
@@ -90,5 +88,5 @@ public:
size_t width = {}; /**< Width of the image region or rectangle */
size_t height = {}; /**< Height of the image region or rectangle */
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_SIZE2D_H */
diff --git a/arm_compute/core/Size3D.h b/arm_compute/core/Size3D.h
index 4241ed4f7e..e2dc6fe012 100644
--- a/arm_compute/core/Size3D.h
+++ b/arm_compute/core/Size3D.h
@@ -40,8 +40,7 @@ public:
* @param[in] h Height of the 3D shape or object
* @param[in] d Depth of the 3D shape or object
*/
- Size3D(size_t w, size_t h, size_t d) noexcept
- : width(w), height(h), depth(d)
+ Size3D(size_t w, size_t h, size_t d) noexcept : width(w), height(h), depth(d)
{
}
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
index 208fc4b294..6b261becc0 100644
--- a/arm_compute/core/Steps.h
+++ b/arm_compute/core/Steps.h
@@ -45,8 +45,7 @@ public:
* @param[in] steps Values to initialize the steps.
*/
template <typename... Ts>
- Steps(Ts... steps)
- : Dimensions{ steps... }
+ Steps(Ts... steps) : Dimensions{steps...}
{
// Initialize empty dimensions to 1
std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
@@ -62,5 +61,5 @@ public:
/** Default destructor */
~Steps() = default;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_STEPS_H*/
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
index b582d066f7..627b219987 100644
--- a/arm_compute/core/Strides.h
+++ b/arm_compute/core/Strides.h
@@ -43,8 +43,7 @@ public:
* @param[in] strides Values to initialize the strides.
*/
template <typename... Ts>
- constexpr Strides(Ts... strides)
- : Dimensions{ strides... }
+ constexpr Strides(Ts... strides) : Dimensions{strides...}
{
}
/** Allow instances of this class to be copy constructed */
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 21703b0d93..7a3ee2cfd0 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_SUBTENSORINFO_H
#define ARM_COMPUTE_SUBTENSORINFO_H
-#include "arm_compute/core/ITensorInfo.h"
-
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
@@ -73,7 +72,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<ITensorInfo> clone() const override;
- ITensorInfo &set_data_type(DataType data_type) override
+ ITensorInfo &set_data_type(DataType data_type) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_data_type(data_type);
@@ -143,7 +142,7 @@ public:
return _parent->offset_element_in_bytes(_coords);
}
int32_t offset_element_in_bytes(const Coordinates &pos) const override;
- size_t element_size() const override
+ size_t element_size() const override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->element_size();
@@ -227,7 +226,7 @@ public:
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(_parent->tensor_shape().total_size() != 0)
+ if (_parent->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
}
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index e738a797b2..b18f750427 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -24,15 +24,14 @@
#ifndef ARM_COMPUTE_TENSORINFO_H
#define ARM_COMPUTE_TENSORINFO_H
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "ITensorInfo.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
+#include "ITensorInfo.h"
#include <cstddef>
#include <memory>
@@ -112,7 +111,10 @@ public:
* @param[in] data_type Data type to use for each tensor element
* @param[in] quantization_info The quantization settings for the tensor data.
*/
- TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info);
+ TensorInfo(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info);
/** Initialize the tensor info with just a format.
*
@@ -136,7 +138,11 @@ public:
* @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
* @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
*/
- void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes);
+ void init(const TensorShape &tensor_shape,
+ Format format,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes);
/** Initialize the tensor info with just a format.
*
@@ -164,8 +170,12 @@ public:
* @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
* @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
*/
- void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes);
+ void init(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes);
/** Initialize the metadata structure for the given tensor shape and single-plane format, (Padding is automatically calculated)
*
* @note The padding used by this method is really conservative so that the tensor can be used for most functions.
@@ -191,19 +201,19 @@ public:
// Inherited methods overridden:
std::unique_ptr<ITensorInfo> clone() const override;
- ITensorInfo &set_data_type(DataType data_type) override;
- ITensorInfo &set_num_channels(int num_channels) override;
- ITensorInfo &set_format(Format format) override;
- ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
- ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override;
- ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
- ITensorInfo &set_data_layout(const DataLayout &data_layout) override;
- ITensorInfo &reset_padding() override;
- bool auto_padding() override;
- ITensorInfo &set_lock_paddings(bool flag) override;
- bool lock_paddings() const override;
- bool extend_padding(const PaddingSize &padding) override;
- size_t dimension(size_t index) const override
+ ITensorInfo &set_data_type(DataType data_type) override;
+ ITensorInfo &set_num_channels(int num_channels) override;
+ ITensorInfo &set_format(Format format) override;
+ ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
+ ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override;
+ ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
+ ITensorInfo &set_data_layout(const DataLayout &data_layout) override;
+ ITensorInfo &reset_padding() override;
+ bool auto_padding() override;
+ ITensorInfo &set_lock_paddings(bool flag) override;
+ bool lock_paddings() const override;
+ bool extend_padding(const PaddingSize &padding) override;
+ size_t dimension(size_t index) const override
{
return _tensor_shape[index];
}
@@ -220,7 +230,7 @@ public:
return _offset_first_element_in_bytes;
}
int32_t offset_element_in_bytes(const Coordinates &pos) const override;
- size_t element_size() const override
+ size_t element_size() const override
{
return data_size_from_type(_data_type) * _num_channels;
}
@@ -266,7 +276,8 @@ public:
}
bool is_dynamic() const override
{
- return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) != std::cend(_dims_state);
+ return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) !=
+ std::cend(_dims_state);
}
bool are_values_constant() const override
{
@@ -343,11 +354,15 @@ private:
*/
inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
{
- return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes)
- && (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format)
- && (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info)
- && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant)
- && (lhs._id == rhs._id);
+ return (lhs._total_size == rhs._total_size) &&
+ (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) &&
+ (lhs._strides_in_bytes == rhs._strides_in_bytes) && (lhs._num_channels == rhs._num_channels) &&
+ (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) &&
+ (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format) &&
+ (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) &&
+ (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info) &&
+ (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant) &&
+ (lhs._id == rhs._id);
}
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
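As an illustrative aside, not part of the patch itself: the operator== rewrapped above is a member-wise comparison, so two TensorInfo objects built from identical metadata compare equal. A minimal sketch using the (shape, channels, data type, quantization info) constructor from this hunk; DataType::F32 and the default-constructed QuantizationInfo are assumed from the wider library:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"

    #include <cassert>

    int main()
    {
        using namespace arm_compute;

        // Identical metadata: 32x16, single channel, F32, default quantization.
        const TensorInfo a(TensorShape(32U, 16U), 1, DataType::F32, QuantizationInfo());
        const TensorInfo b(TensorShape(32U, 16U), 1, DataType::F32, QuantizationInfo());

        // operator== compares shape, strides, type, format, padding, quantization,
        // layout and id member by member.
        assert(a == b);
        return 0;
    }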
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index 4c9186ac64..c1707e262f 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -44,11 +44,10 @@ public:
* @param[in] dims Values to initialize the dimensions.
*/
template <typename... Ts>
- TensorShape(Ts... dims)
- : Dimensions{ dims... }
+ TensorShape(Ts... dims) : Dimensions{dims...}
{
// Initialize unspecified dimensions to 1
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
}
@@ -79,7 +78,7 @@ public:
TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true, bool increase_dim_unit = true)
{
// Clear entire shape if one dimension is zero
- if(value == 0)
+ if (value == 0)
{
_num_dimensions = 0;
std::fill(_id.begin(), _id.end(), 0);
@@ -94,7 +93,7 @@ public:
Dimensions::set(dimension, value, increase_dim_unit);
// Correct number dimensions to ignore trailing dimensions of size 1
- if(apply_dim_correction)
+ if (apply_dim_correction)
{
apply_dimension_correction();
}
@@ -123,7 +122,7 @@ public:
std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
// Correct number dimensions to ignore trailing dimensions of size 1
- if(apply_dim_correction)
+ if (apply_dim_correction)
{
apply_dimension_correction();
}
@@ -212,26 +211,26 @@ public:
* @return The broadcasted shape or an empty shape if the shapes are not broadcast compatible.
*/
template <typename... Shapes>
- static TensorShape broadcast_shape(const Shapes &... shapes)
+ static TensorShape broadcast_shape(const Shapes &...shapes)
{
TensorShape bc_shape;
- auto broadcast = [&bc_shape](const TensorShape & other)
+ auto broadcast = [&bc_shape](const TensorShape &other)
{
- if(bc_shape.num_dimensions() == 0)
+ if (bc_shape.num_dimensions() == 0)
{
bc_shape = other;
}
- else if(other.num_dimensions() != 0)
+ else if (other.num_dimensions() != 0)
{
- for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
{
const size_t dim_min = std::min(bc_shape[d], other[d]);
const size_t dim_max = std::max(bc_shape[d], other[d]);
- if((dim_min != 1) && (dim_min != dim_max))
+ if ((dim_min != 1) && (dim_min != dim_max))
{
- bc_shape = TensorShape{ 0U };
+ bc_shape = TensorShape{0U};
break;
}
@@ -249,9 +248,9 @@ private:
/** Remove trailing dimensions of size 1 from the reported number of dimensions. */
void apply_dimension_correction()
{
- for(int i = static_cast<int>(_num_dimensions) - 1; i > 0; --i)
+ for (int i = static_cast<int>(_num_dimensions) - 1; i > 0; --i)
{
- if(_id[i] == 1)
+ if (_id[i] == 1)
{
--_num_dimensions;
}
@@ -262,5 +261,5 @@ private:
}
}
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORSHAPE_H*/
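As an illustrative aside, not part of the patch itself: broadcast_shape, reformatted above, merges shapes dimension by dimension, letting size-1 dimensions broadcast and falling back to an empty shape (TensorShape{0U}) on a real mismatch. A minimal sketch with made-up shapes:

    #include "arm_compute/core/TensorShape.h"

    #include <iostream>

    int main()
    {
        using arm_compute::TensorShape;

        const TensorShape a(8U, 1U, 3U); // the size-1 dimension can broadcast
        const TensorShape b(8U, 4U, 3U);

        const TensorShape ok  = TensorShape::broadcast_shape(a, b);                // -> [8, 4, 3]
        const TensorShape bad = TensorShape::broadcast_shape(a, TensorShape(5U));  // 8 vs 5 -> empty shape

        std::cout << ok[0] << "x" << ok[1] << "x" << ok[2] << ", incompatible total_size: " << bad.total_size()
                  << std::endl;
        return 0;
    }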
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 9264cefe3e..6b51af17d4 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -59,13 +59,13 @@
/** The following symbols have been moved to:
* MatMulInfo
*/
-#include "arm_compute/function_info/MatMulInfo.h"
-
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Size3D.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
#include "support/Bfloat16.h"
#include <cmath>
@@ -143,8 +143,7 @@ enum class ComparisonOperation
struct ValidRegion
{
/** Default constructor */
- ValidRegion()
- : anchor{}, shape{}
+ ValidRegion() : anchor{}, shape{}
{
}
@@ -165,8 +164,7 @@ struct ValidRegion
* @param[in] a_shape Shape of the valid region.
*
*/
- ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape)
- : anchor{ an_anchor }, shape{ a_shape }
+ ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape) : anchor{an_anchor}, shape{a_shape}
{
anchor.set_num_dimensions(std::max(anchor.num_dimensions(), shape.num_dimensions()));
}
@@ -179,7 +177,7 @@ struct ValidRegion
*
*/
ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape, size_t num_dimensions)
- : anchor{ an_anchor }, shape{ a_shape }
+ : anchor{an_anchor}, shape{a_shape}
{
ARM_COMPUTE_ERROR_ON(num_dimensions < std::max(anchor.num_dimensions(), shape.num_dimensions()));
anchor.set_num_dimensions(num_dimensions);
@@ -241,32 +239,24 @@ enum class BorderMode
struct BorderSize
{
/** Empty border, i.e. no border */
- constexpr BorderSize() noexcept
- : top{ 0 },
- right{ 0 },
- bottom{ 0 },
- left{ 0 }
+ constexpr BorderSize() noexcept : top{0}, right{0}, bottom{0}, left{0}
{
}
/** Border with equal size around the 2D plane */
- explicit constexpr BorderSize(unsigned int size) noexcept
- : top{ size },
- right{ size },
- bottom{ size },
- left{ size }
+ explicit constexpr BorderSize(unsigned int size) noexcept : top{size}, right{size}, bottom{size}, left{size}
{
}
/** Border with same size for top/bottom and left/right */
constexpr BorderSize(unsigned int top_bottom, unsigned int left_right)
- : top{ top_bottom }, right{ left_right }, bottom{ top_bottom }, left{ left_right }
+ : top{top_bottom}, right{left_right}, bottom{top_bottom}, left{left_right}
{
}
/** Border with different sizes */
constexpr BorderSize(unsigned int top, unsigned int right, unsigned int bottom, unsigned int left)
- : top{ top }, right{ right }, bottom{ bottom }, left{ left }
+ : top{top}, right{right}, bottom{bottom}, left{left}
{
}
@@ -371,7 +361,7 @@ enum class InterpolationPolicy
{
NEAREST_NEIGHBOR, /**< Output values are defined to match the source pixel whose center is nearest to the sample position */
BILINEAR, /**< Output values are defined by bilinear interpolation between the pixels */
- AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
+ AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
};
/** Bilinear Interpolation method used by LKTracker */
@@ -478,12 +468,12 @@ enum class NormType
*/
struct DetectionWindow
{
- uint16_t x{ 0 }; /**< Top-left x coordinate */
- uint16_t y{ 0 }; /**< Top-left y coordinate */
- uint16_t width{ 0 }; /**< Width of the detection window */
- uint16_t height{ 0 }; /**< Height of the detection window */
- uint16_t idx_class{ 0 }; /**< Index of the class */
- float score{ 0.f }; /**< Confidence value for the detection window */
+ uint16_t x{0}; /**< Top-left x coordinate */
+ uint16_t y{0}; /**< Top-left y coordinate */
+ uint16_t width{0}; /**< Width of the detection window */
+ uint16_t height{0}; /**< Height of the detection window */
+ uint16_t idx_class{0}; /**< Index of the class */
+ float score{0.f}; /**< Confidence value for the detection window */
};
/** Available pooling types */
@@ -520,12 +510,28 @@ public:
* @param[in] im_width (Optional) Boxes whose centers (on the x axis) are beyond im_width will be filtered. Defaults to 1
* @param[in] im_height (Optional) Boxes whose centers (on the y axis) are beyond im_height will be filtered. Defaults to 1
*/
- BoxNMSLimitInfo(float score_thresh = 0.05f, float nms = 0.3f,
- int detections = 100, bool soft_nms_enabled = false,
- NMSType soft_nms_method = NMSType::LINEAR,
- float soft_nms_sigma = 0.5f, float soft_nms_min_score_thres = 0.001f, bool suppress_size = false, float min_size = 1.0f, float im_width = 1.0f, float im_height = 1.0f)
- : _score_thresh(score_thresh), _nms(nms), _detections_per_im(detections), _soft_nms_enabled(soft_nms_enabled), _soft_nms_method(soft_nms_method), _soft_nms_sigma(soft_nms_sigma),
- _soft_nms_min_score_thres(soft_nms_min_score_thres), _suppress_size(suppress_size), _min_size(min_size), _im_width(im_width), _im_height(im_height)
+ BoxNMSLimitInfo(float score_thresh = 0.05f,
+ float nms = 0.3f,
+ int detections = 100,
+ bool soft_nms_enabled = false,
+ NMSType soft_nms_method = NMSType::LINEAR,
+ float soft_nms_sigma = 0.5f,
+ float soft_nms_min_score_thres = 0.001f,
+ bool suppress_size = false,
+ float min_size = 1.0f,
+ float im_width = 1.0f,
+ float im_height = 1.0f)
+ : _score_thresh(score_thresh),
+ _nms(nms),
+ _detections_per_im(detections),
+ _soft_nms_enabled(soft_nms_enabled),
+ _soft_nms_method(soft_nms_method),
+ _soft_nms_sigma(soft_nms_sigma),
+ _soft_nms_min_score_thres(soft_nms_min_score_thres),
+ _suppress_size(suppress_size),
+ _min_size(min_size),
+ _im_width(im_width),
+ _im_height(im_height)
{
}
/** Get the score threshold */
@@ -603,14 +609,13 @@ private:
struct Padding2D
{
Padding2D() = default;
- Padding2D(size_t left, size_t right, size_t top, size_t bottom)
- : left(left), right(right), top(top), bottom(bottom)
+ Padding2D(size_t left, size_t right, size_t top, size_t bottom) : left(left), right(right), top(top), bottom(bottom)
{
}
- size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */
- size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */
- size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. */
- size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */
+ size_t left = {0}; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = {0}; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = {0}; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = {0}; /**< Padding across the height dimension on the bottom, in elements. */
};
/** Padding information for 3D operations like Conv3d */
@@ -630,12 +635,12 @@ struct Padding3D
{
}
- size_t left = { 0 }; /**< Padding across the width dimenstion on the left, in elements. */
- size_t right = { 0 }; /**< Padding across the width dimenstion on the right, in elements. */
- size_t top = { 0 }; /**< Padding across the height dimenstion on the top, in elements. */
- size_t bottom = { 0 }; /**< Padding across the height dimenstion on the bottom, in elements. */
- size_t front = { 0 }; /**< Padding across the depth dimenstion on the front, in elements. */
- size_t back = { 0 }; /**< Padding across the depth dimenstion on the back, in elements. */
+ size_t left = {0}; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = {0}; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = {0}; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = {0}; /**< Padding across the height dimension on the bottom, in elements. */
+ size_t front = {0}; /**< Padding across the depth dimension on the front, in elements. */
+ size_t back = {0}; /**< Padding across the depth dimension on the back, in elements. */
};
/** PriorBox layer info */
@@ -667,9 +672,15 @@ public:
* @param[in] img_size (Optional) Image size.
* @param[in] steps (Optional) Step values.
*/
- PriorBoxLayerInfo(const std::vector<float> &min_sizes, const std::vector<float> &variances, float offset, bool flip = true, bool clip = false,
- const std::vector<float> &max_sizes = {}, const std::vector<float> &aspect_ratios = {},
- const Coordinates2D &img_size = Coordinates2D{ 0, 0 }, const std::array<float, 2> &steps = { { 0.f, 0.f } })
+ PriorBoxLayerInfo(const std::vector<float> &min_sizes,
+ const std::vector<float> &variances,
+ float offset,
+ bool flip = true,
+ bool clip = false,
+ const std::vector<float> &max_sizes = {},
+ const std::vector<float> &aspect_ratios = {},
+ const Coordinates2D &img_size = Coordinates2D{0, 0},
+ const std::array<float, 2> &steps = {{0.f, 0.f}})
: _min_sizes(min_sizes),
_variances(variances),
_offset(offset),
@@ -681,22 +692,22 @@ public:
_steps(steps)
{
_aspect_ratios.push_back(1.);
- for(unsigned int i = 0; i < aspect_ratios.size(); ++i)
+ for (unsigned int i = 0; i < aspect_ratios.size(); ++i)
{
float ar = aspect_ratios[i];
bool already_exist = false;
- for(auto ar_new : _aspect_ratios)
+ for (auto ar_new : _aspect_ratios)
{
- if(fabs(ar - ar_new) < 1e-6)
+ if (fabs(ar - ar_new) < 1e-6)
{
already_exist = true;
break;
}
}
- if(!already_exist)
+ if (!already_exist)
{
_aspect_ratios.push_back(ar);
- if(flip)
+ if (flip)
{
_aspect_ratios.push_back(1.f / ar);
}
@@ -808,8 +819,16 @@ public:
* @param[in] variance_encoded_in_target (Optional) If true, variance is encoded in target. Otherwise we need to adjust the predicted offset accordingly. Default set to false.
* @param[in] eta (Optional) Eta.
*/
- DetectionOutputLayerInfo(int num_classes, bool share_location, DetectionOutputLayerCodeType code_type, int keep_top_k, float nms_threshold, int top_k = -1, int background_label_id = -1,
- float confidence_threshold = std::numeric_limits<float>::lowest(), bool variance_encoded_in_target = false, float eta = 1)
+ DetectionOutputLayerInfo(int num_classes,
+ bool share_location,
+ DetectionOutputLayerCodeType code_type,
+ int keep_top_k,
+ float nms_threshold,
+ int top_k = -1,
+ int background_label_id = -1,
+ float confidence_threshold = std::numeric_limits<float>::lowest(),
+ bool variance_encoded_in_target = false,
+ float eta = 1)
: _num_classes(num_classes),
_share_location(share_location),
_code_type(code_type),
@@ -923,8 +942,15 @@ public:
* @param[in] detection_per_class (Optional) Number of detection per class. Used in the Regular Non-Max-Suppression. Defaults to 100.
* @param[in] dequantize_scores (Optional) If the scores need to be dequantized. Defaults to true.
*/
- DetectionPostProcessLayerInfo(unsigned int max_detections, unsigned int max_classes_per_detection, float nms_score_threshold, float iou_threshold, unsigned int num_classes,
- std::array<float, 4> scales_values, bool use_regular_nms = false, unsigned int detection_per_class = 100, bool dequantize_scores = true)
+ DetectionPostProcessLayerInfo(unsigned int max_detections,
+ unsigned int max_classes_per_detection,
+ float nms_score_threshold,
+ float iou_threshold,
+ unsigned int num_classes,
+ std::array<float, 4> scales_values,
+ bool use_regular_nms = false,
+ unsigned int detection_per_class = 100,
+ bool dequantize_scores = true)
: _max_detections(max_detections),
_max_classes_per_detection(max_classes_per_detection),
_nms_score_threshold(nms_score_threshold),
@@ -1240,8 +1266,14 @@ public:
* @param[in] spatial_scale Spatial scale to be applied to the ROI coordinates and dimensions.
* @param[in] sampling_ratio Number of samples to include in each pooling region (if set to zero, a ceil(roi_dims/pooling_dims))
*/
- ROIPoolingLayerInfo(unsigned int pooled_width, unsigned int pooled_height, float spatial_scale, unsigned int sampling_ratio = 0)
- : _pooled_width(pooled_width), _pooled_height(pooled_height), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio)
+ ROIPoolingLayerInfo(unsigned int pooled_width,
+ unsigned int pooled_height,
+ float spatial_scale,
+ unsigned int sampling_ratio = 0)
+ : _pooled_width(pooled_width),
+ _pooled_height(pooled_height),
+ _spatial_scale(spatial_scale),
+ _sampling_ratio(sampling_ratio)
{
}
/** Get the pooled width of the layer */
@@ -1288,10 +1320,24 @@ public:
* @param[in] min_size (Optional) Size used to validate the anchors produced. Defaults to 16.
* @param[in] values_per_roi (Optional) Values used to represent a ROI (Region of interest). Defaults to 4.
*/
- GenerateProposalsInfo(float im_width, float im_height, float im_scale, float spatial_scale = 1.0, int pre_nms_topN = 6000, int post_nms_topN = 300, float nms_thres = 0.7, float min_size = 16.0,
+ GenerateProposalsInfo(float im_width,
+ float im_height,
+ float im_scale,
+ float spatial_scale = 1.0,
+ int pre_nms_topN = 6000,
+ int post_nms_topN = 300,
+ float nms_thres = 0.7,
+ float min_size = 16.0,
size_t values_per_roi = 4)
- : _im_height(im_height), _im_width(im_width), _im_scale(im_scale), _spatial_scale(spatial_scale), _pre_nms_topN(pre_nms_topN), _post_nms_topN(post_nms_topN), _nms_thres(nms_thres),
- _min_size(min_size), _values_per_roi(values_per_roi)
+ : _im_height(im_height),
+ _im_width(im_width),
+ _im_scale(im_scale),
+ _spatial_scale(spatial_scale),
+ _pre_nms_topN(pre_nms_topN),
+ _post_nms_topN(post_nms_topN),
+ _nms_thres(nms_thres),
+ _min_size(min_size),
+ _values_per_roi(values_per_roi)
{
}
@@ -1417,11 +1463,20 @@ public:
* @param[in] correct_transform_coords (Optional) Correct bounding box transform coordinates. Defaults to false
* @param[in] bbox_xform_clip (Optional) Minimum bounding box width and height after bounding box transformation in log-space. Defaults to log(1000/16)
*/
- BoundingBoxTransformInfo(float img_width, float img_height, float scale, bool apply_scale = false, const std::array<float, 4> weights = { { 1.f, 1.f, 1.f, 1.f } }, bool correct_transform_coords =
- false,
- float bbox_xform_clip =
- 4.135166556742356f)
- : _img_width(img_width), _img_height(img_height), _scale(scale), _apply_scale(apply_scale), _correct_transform_coords(correct_transform_coords), _weights(weights), _bbox_xform_clip(bbox_xform_clip)
+ BoundingBoxTransformInfo(float img_width,
+ float img_height,
+ float scale,
+ bool apply_scale = false,
+ const std::array<float, 4> weights = {{1.f, 1.f, 1.f, 1.f}},
+ bool correct_transform_coords = false,
+ float bbox_xform_clip = 4.135166556742356f)
+ : _img_width(img_width),
+ _img_height(img_height),
+ _scale(scale),
+ _apply_scale(apply_scale),
+ _correct_transform_coords(correct_transform_coords),
+ _weights(weights),
+ _bbox_xform_clip(bbox_xform_clip)
{
}
@@ -1484,7 +1539,12 @@ public:
* @param[in] is_scaled (Optional) Boolean that specifies if alpha will be scaled by the normalization size or not.
* Should be false to follow [Krichevksy 2012].
*/
- NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f, bool is_scaled = true)
+ NormalizationLayerInfo(NormType type,
+ uint32_t norm_size = 5,
+ float alpha = 0.0001f,
+ float beta = 0.5f,
+ float kappa = 1.f,
+ bool is_scaled = true)
: _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa), _is_scaled(is_scaled)
{
}
@@ -1612,7 +1672,12 @@ class WeightsInfo
public:
/** Default constructor */
WeightsInfo()
- : _are_reshaped(false), _kernel_width(0), _kernel_height(0), _num_kernels(0), _retain_internal_weights(false), _weight_format(arm_compute::WeightFormat::UNSPECIFIED)
+ : _are_reshaped(false),
+ _kernel_width(0),
+ _kernel_height(0),
+ _num_kernels(0),
+ _retain_internal_weights(false),
+ _weight_format(arm_compute::WeightFormat::UNSPECIFIED)
{
}
/** Constructor
@@ -1624,9 +1689,18 @@ public:
* @param[in] retain_internal_weights (Optional) True if internal reshaped weights must be retained. Used for reconfiguration purposes. Default is false.
* @param[in] weight_format (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
*/
- WeightsInfo(bool are_reshaped, unsigned int kernel_width, unsigned int kernel_height, unsigned int num_kernels, bool retain_internal_weights = false,
- arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED)
- : _are_reshaped(are_reshaped), _kernel_width(kernel_width), _kernel_height(kernel_height), _num_kernels(num_kernels), _retain_internal_weights(retain_internal_weights), _weight_format(weight_format)
+ WeightsInfo(bool are_reshaped,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
+ unsigned int num_kernels,
+ bool retain_internal_weights = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED)
+ : _are_reshaped(are_reshaped),
+ _kernel_width(kernel_width),
+ _kernel_height(kernel_height),
+ _num_kernels(num_kernels),
+ _retain_internal_weights(retain_internal_weights),
+ _weight_format(weight_format)
{
}
/** Flag which specifies if the weights tensor has been reshaped.
@@ -1698,7 +1772,14 @@ class GEMMReshapeInfo final
public:
/** Default constructor */
GEMMReshapeInfo()
- : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false), _broadcast_bias(false)
+ : _m(1),
+ _n(1),
+ _k(1),
+ _mult_transpose1xW_width(1),
+ _mult_interleave4x4_height(1),
+ _depth_output_gemm3d(0),
+ _reinterpret_input_as_3d(false),
+ _broadcast_bias(false)
{
}
/** Constructor
@@ -1714,9 +1795,22 @@ public:
* to perform 1x1 convolutions with the NHWC data layout)
* @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
*/
- GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool broadcast_bias = false)
- : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d),
- _reinterpret_input_as_3d(reinterpret_input_as_3d), _broadcast_bias(broadcast_bias)
+ GEMMReshapeInfo(int m,
+ int n,
+ int k,
+ int mult_transpose1xW_width = 1,
+ int mult_interleave4x4_height = 1,
+ int depth_output_gemm3d = 0,
+ bool reinterpret_input_as_3d = false,
+ bool broadcast_bias = false)
+ : _m(m),
+ _n(n),
+ _k(k),
+ _mult_transpose1xW_width(mult_transpose1xW_width),
+ _mult_interleave4x4_height(mult_interleave4x4_height),
+ _depth_output_gemm3d(depth_output_gemm3d),
+ _reinterpret_input_as_3d(reinterpret_input_as_3d),
+ _broadcast_bias(broadcast_bias)
{
}
/** Number of matrix A rows
@@ -1806,11 +1900,11 @@ struct GEMMLHSMatrixInfo
: m0(m), k0(k), v0(v), transpose(trans), interleave(inter)
{
}
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- bool transpose{ true }; /**< True if the (m0xk0) block has to be transposed before been stored */
- bool interleave{ true }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ bool transpose{true}; /**< True if the (m0xk0) block has to be transposed before being stored */
+ bool interleave{true}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
};
/** GEMM RHS (Right Hand Side) matrix information */
@@ -1821,12 +1915,13 @@ struct GEMMRHSMatrixInfo
: n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img)
{
}
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool transpose{ true }; /**< True if the (k0xn0) block has to be transposed before been stored */
- bool interleave{ true }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
- bool export_to_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool transpose{true}; /**< True if the (k0xn0) block has to be transposed before being stored */
+ bool interleave{true}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+ bool export_to_cl_image{
+ false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
class ITensorInfo;
@@ -1842,16 +1937,23 @@ struct WinogradInfo
* @param[in] conv_info Convolution info (Pads, strides)
* @param[in] data_layout Data layout to use for the output tensor once the convolution has been applied
*/
- WinogradInfo(Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout)
- : output_tile_size(output_tile_sz), kernel_size(kernel_sz), input_dimensions(input_dims), convolution_info(conv_info), output_data_layout(data_layout)
- {
- }
-
- Size2D output_tile_size{}; /**< Width and height of the output tile */
- Size2D kernel_size{}; /**< Width and height of the kernel*/
- Size2D input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */
- PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) */
- DataLayout output_data_layout{ DataLayout::NCHW }; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */
+ WinogradInfo(
+ Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout)
+ : output_tile_size(output_tile_sz),
+ kernel_size(kernel_sz),
+ input_dimensions(input_dims),
+ convolution_info(conv_info),
+ output_data_layout(data_layout)
+ {
+ }
+
+ Size2D output_tile_size{}; /**< Width and height of the output tile */
+ Size2D kernel_size{}; /**< Width and height of the kernel*/
+ Size2D input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */
+ PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) */
+ DataLayout output_data_layout{
+ DataLayout::
+ NCHW}; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */
};
/** IO formatting information class*/
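As an illustrative aside, not part of the patch itself: the BorderSize constructors condensed above cover the four ways a border can be specified. A minimal sketch:

    #include "arm_compute/core/Types.h"

    #include <iostream>

    int main()
    {
        using arm_compute::BorderSize;

        constexpr BorderSize none{};                      // no border
        constexpr BorderSize uniform{3};                  // same size on all four sides
        constexpr BorderSize symmetric{1, 2};             // top/bottom = 1, left/right = 2
        constexpr BorderSize explicit_sizes{1, 2, 3, 4};  // top, right, bottom, left

        std::cout << none.top << " " << uniform.left << " " << symmetric.bottom << " "
                  << explicit_sizes.right << std::endl; // prints: 0 3 1 2
        return 0;
    }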
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index c5b50167bf..a2146522f7 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -69,7 +69,7 @@ template <typename T>
inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm)
{
const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
{
T dimension_val = old_dim[i];
dimensions.set(perm[i], dimension_val);
@@ -87,7 +87,11 @@ inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &
*
* @return PadStrideInfo for SAME padding
*/
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout = DataLayout::NCHW, const Size2D &dilation = Size2D(1u, 1u),
+PadStrideInfo calculate_same_pad(TensorShape input_shape,
+ TensorShape weights_shape,
+ PadStrideInfo conv_info,
+ DataLayout data_layout = DataLayout::NCHW,
+ const Size2D &dilation = Size2D(1u, 1u),
const DimensionRoundingType &rounding_type = DimensionRoundingType::FLOOR);
/** Returns expected width and height of the deconvolution's output tensor.
@@ -100,8 +104,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
*
* @return A pair with the new width in the first position and the new height in the second.
*/
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
- unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width,
+ unsigned int in_height,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
const PadStrideInfo &pad_stride_info);
/** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
@@ -115,8 +121,10 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i
*
* @return A pair with the new width in the first position and the new height in the second.
*/
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
- int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int width,
+ int height,
+ int kernel_width,
+ int kernel_height,
const PadStrideInfo &pad_stride_info,
const Size2D &dilation = Size2D(1U, 1U));
@@ -130,9 +138,8 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
*
* @return A pair with the new width in the first position and the new height in the second, returned values can be < 1
*/
-std::pair<int, int> scaled_dimensions_signed(int width, int height,
- int kernel_width, int kernel_height,
- const PadStrideInfo &pad_stride_info);
+std::pair<int, int> scaled_dimensions_signed(
+ int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info);
/** Returns calculated width, height and depth of output scaled tensor depending on dimensions rounding mode.
*
@@ -147,8 +154,12 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
* @return A tuple with the new width in the first position, the new height in the second, and the new depth in the third.
* Returned values can be < 1
*/
-std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int depth,
- int kernel_width, int kernel_height, int kernel_depth,
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int width,
+ int height,
+ int depth,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
const Pooling3dLayerInfo &pool3d_info);
/** Check if the given reduction operation should be handled in a serial way.
@@ -178,7 +189,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
*
* @return The pair with minimum and maximum values
*/
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info);
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+ DataType data_type,
+ UniformQuantizationInfo oq_info);
/** Convert a channel identity into a string.
*
@@ -295,26 +308,27 @@ inline size_t num_of_elements_in_range(const float start, const float end, const
* @param[in] element_delim (Optional) Delimiter among the consecutive elements. Defaults to space delimiter
*/
template <typename T>
-void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
+void print_consecutive_elements_impl(
+ std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
{
using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
std::ios stream_status(nullptr);
stream_status.copyfmt(s);
- for(unsigned int i = 0; i < n; ++i)
+ for (unsigned int i = 0; i < n; ++i)
{
// Set stream width as it is not a "sticky" stream manipulator
- if(stream_width != 0)
+ if (stream_width != 0)
{
s.width(stream_width);
}
- if(std::is_same<typename std::decay<T>::type, half>::value)
+ if (std::is_same<typename std::decay<T>::type, half>::value)
{
// We use T instead of print_type here because std::is_floating_point<half> returns false and then print_type becomes int.
s << std::right << static_cast<T>(ptr[i]) << element_delim;
}
- else if(std::is_same<typename std::decay<T>::type, bfloat16>::value)
+ else if (std::is_same<typename std::decay<T>::type, bfloat16>::value)
{
// We use T instead of print_type here because std::is_floating_point<bfloat16> returns false and then print_type becomes int.
s << std::right << float(ptr[i]) << element_delim;
@@ -343,17 +357,17 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
int max_width = -1;
- for(unsigned int i = 0; i < n; ++i)
+ for (unsigned int i = 0; i < n; ++i)
{
std::stringstream ss;
ss.copyfmt(s);
- if(std::is_same<typename std::decay<T>::type, half>::value)
+ if (std::is_same<typename std::decay<T>::type, half>::value)
{
// We use T instead of print_type here because std::is_floating_point<half> returns false and then print_type becomes int.
ss << static_cast<T>(ptr[i]);
}
- else if(std::is_same<typename std::decay<T>::type, bfloat16>::value)
+ else if (std::is_same<typename std::decay<T>::type, bfloat16>::value)
{
// We use T instead of print_type here because std::is_floating_point<bfloat> returns false and then print_type becomes int.
ss << float(ptr[i]);
@@ -377,7 +391,12 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
* @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
* @param[in] element_delim (Optional) Delimiter among the consecutive elements. Defaults to space delimiter
*/
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " ");
+void print_consecutive_elements(std::ostream &s,
+ DataType dt,
+ const uint8_t *ptr,
+ unsigned int n,
+ int stream_width,
+ const std::string &element_delim = " ");
/** Identify the maximum width of n consecutive elements.
*
@@ -390,5 +409,5 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
*/
int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n);
#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_UTILS_H */
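As an illustrative aside, not part of the patch itself: print_consecutive_elements_impl, rewrapped above, right-aligns each element in the requested stream width before appending the delimiter. A minimal sketch, assuming the library is built with ARM_COMPUTE_ASSERTS_ENABLED (the guard closed by the #endif at the end of this hunk):

    #include "arm_compute/core/Utils.h"

    #include <iostream>

    int main()
    {
        const float values[] = {1.5f, -2.25f, 3.0f, 42.0f};

        // Prints the four floats right-aligned in a field of width 8, separated by spaces.
        arm_compute::print_consecutive_elements_impl(std::cout, values, 4U, 8, " ");
        std::cout << std::endl;
        return 0;
    }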
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index 5bffc16f3b..5550560aff 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -24,13 +24,13 @@
#ifndef ARM_COMPUTE_VALIDATE_H
#define ARM_COMPUTE_VALIDATE_H
-#include "arm_compute/core/utils/DataLayoutUtils.h"
-#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/FormatUtils.h"
#include "arm_compute/core/IKernel.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/utils/DataLayoutUtils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/FormatUtils.h"
#include "arm_compute/core/Window.h"
#include <algorithm>
@@ -50,9 +50,9 @@ namespace detail
template <typename T>
inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimensions<T> &dim2, unsigned int upper_dim)
{
- for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
+ for (unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
{
- if(dim1[i] != dim2[i])
+ if (dim1[i] != dim2[i])
{
return true;
}
@@ -80,7 +80,7 @@ public:
* @param[in] line Source code line. Used for error reporting.
*/
compare_dimension(const Dimensions<T> &dim, const char *function, const char *file, int line)
- : _dim{ dim }, _function{ function }, _file{ file }, _line{ line }
+ : _dim{dim}, _function{function}, _file{file}, _line{line}
{
}
@@ -111,7 +111,7 @@ inline arm_compute::Status for_each_error(F &&)
}
template <typename F, typename T, typename... Ts>
-inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&... args)
+inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&...args)
{
ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
@@ -148,13 +148,11 @@ struct get_tensor_info_t<ITensorInfo *>
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&...pointers)
{
- const std::array<const void *, sizeof...(Ts)> pointers_array{ { std::forward<Ts>(pointers)... } };
- bool has_nullptr = std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr)
- {
- return (ptr == nullptr);
- });
+ const std::array<const void *, sizeof...(Ts)> pointers_array{{std::forward<Ts>(pointers)...}};
+ bool has_nullptr =
+ std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr) { return (ptr == nullptr); });
ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(has_nullptr, function, file, line, "Nullptr object!");
return arm_compute::Status{};
}
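As an illustrative aside, not part of the patch itself: error_on_nullptr, reflowed above, folds every pointer argument into an array and flags an error if any of them is null; the ARM_COMPUTE_*_ON_NULLPTR macros (outside this hunk) supply __func__, __FILE__ and __LINE__ for it. A minimal sketch calling it directly; ErrorCode::OK is assumed from Error.h:

    #include "arm_compute/core/Validate.h"

    #include <iostream>

    int main()
    {
        int  value = 0;
        int *good  = &value;
        int *bad   = nullptr;

        const arm_compute::Status ok  = arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, good);
        const arm_compute::Status err = arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, good, bad);

        std::cout << std::boolalpha << (ok.error_code() == arm_compute::ErrorCode::OK) << " "
                  << (err.error_code() == arm_compute::ErrorCode::OK) << std::endl; // prints: true false
        return 0;
    }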
@@ -178,8 +176,8 @@ inline arm_compute::Status error_on_nullptr(const char *function, const char *fi
*
* @return Status
*/
-arm_compute::Status error_on_mismatching_windows(const char *function, const char *file, const int line,
- const Window &full, const Window &win);
+arm_compute::Status error_on_mismatching_windows(
+ const char *function, const char *file, const int line, const Window &full, const Window &win);
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
@@ -200,8 +198,8 @@ arm_compute::Status error_on_mismatching_windows(const char *function, const cha
*
* @return Status
*/
-arm_compute::Status error_on_invalid_subwindow(const char *function, const char *file, const int line,
- const Window &full, const Window &sub);
+arm_compute::Status error_on_invalid_subwindow(
+ const char *function, const char *file, const int line, const Window &full, const Window &sub);
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBWINDOW(f, s) \
@@ -220,12 +218,14 @@ arm_compute::Status error_on_invalid_subwindow(const char *function, const char
*
* @return Status
*/
-arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
- const Window &full, const Window &window, const int dim);
+arm_compute::Status error_on_window_not_collapsable_at_dimension(
+ const char *function, const char *file, const int line, const Window &full, const Window &window, const int dim);
#define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
/** Return an error if the passed coordinates have too many dimensions.
*
@@ -239,8 +239,8 @@ arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *fun
*
* @return Status
*/
-arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
- const Coordinates &pos, unsigned int max_dim);
+arm_compute::Status error_on_coordinates_dimensions_gte(
+ const char *function, const char *file, const int line, const Coordinates &pos, unsigned int max_dim);
#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md))
#define ARM_COMPUTE_RETURN_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
@@ -258,8 +258,8 @@ arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, co
*
* @return Status
*/
-arm_compute::Status error_on_window_dimensions_gte(const char *function, const char *file, const int line,
- const Window &win, unsigned int max_dim);
+arm_compute::Status error_on_window_dimensions_gte(
+ const char *function, const char *file, const int line, const Window &win, unsigned int max_dim);
#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md))
#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
@@ -277,16 +277,23 @@ arm_compute::Status error_on_window_dimensions_gte(const char *function, const c
* @return Status
*/
template <typename T, typename... Ts>
-arm_compute::Status error_on_mismatching_dimensions(const char *function, const char *file, int line,
- const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
+arm_compute::Status error_on_mismatching_dimensions(const char *function,
+ const char *file,
+ int line,
+ const Dimensions<T> &dim1,
+ const Dimensions<T> &dim2,
+ Ts &&...dims)
{
- ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2,
+ std::forward<Ts>(dims)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return true if the given format has horizontal subsampling.
*
@@ -296,7 +303,10 @@ arm_compute::Status error_on_mismatching_dimensions(const char *function, const
*/
inline bool has_format_horizontal_subsampling(Format format)
{
- return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false;
+ return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 ||
+ format == Format::NV21 || format == Format::IYUV || format == Format::UV88)
+ ? true
+ : false;
}
/** Return true if the given format has vertical subsampling.
@@ -307,7 +317,9 @@ inline bool has_format_horizontal_subsampling(Format format)
*/
inline bool has_format_vertical_subsampling(Format format)
{
- return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false;
+ return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88)
+ ? true
+ : false;
}
/** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats.
@@ -325,16 +337,16 @@ inline bool has_format_vertical_subsampling(Format format)
*/
inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format)
{
- TensorShape output{ shape };
+ TensorShape output{shape};
// Force width to be even for formats which require subsampling of the U and V channels
- if(has_format_horizontal_subsampling(format))
+ if (has_format_horizontal_subsampling(format))
{
output.set(0, (output.x() + 1) & ~1U);
}
// Force height to be even for formats which require subsampling of the U and V channels
- if(has_format_vertical_subsampling(format))
+ if (has_format_vertical_subsampling(format))
{
output.set(1, (output.y() + 1) & ~1U);
}
@@ -354,18 +366,20 @@ inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format)
* @return Status
*/
template <typename... Ts>
-arm_compute::Status error_on_tensors_not_even(const char *function, const char *file, int line,
- const Format &format, const ITensor *tensor1, Ts... tensors)
+arm_compute::Status error_on_tensors_not_even(
+ const char *function, const char *file, int line, const Format &format, const ITensor *tensor1, Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor)
- {
- const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format);
- return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2);
- }),
- function, file, line, "Tensor shape has odd dimensions");
+ const std::array<const ITensor *, 1 + sizeof...(Ts)> tensors_info_array{{tensor1, std::forward<Ts>(tensors)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(),
+ [&](const ITensor *tensor)
+ {
+ const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format);
+ return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2);
+ }),
+ function, file, line, "Tensor shape has odd dimensions");
return arm_compute::Status{};
}
@@ -382,21 +396,22 @@ arm_compute::Status error_on_tensors_not_even(const char *function, const char *
*
* @return The subsampled tensor shape.
*/
-inline TensorShape calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN)
+inline TensorShape
+calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN)
{
- TensorShape output{ shape };
+ TensorShape output{shape};
// Subsample shape only for U or V channel
- if(Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel)
+ if (Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel)
{
// Subsample width for the tensor shape when channel is U or V
- if(has_format_horizontal_subsampling(format))
+ if (has_format_horizontal_subsampling(format))
{
output.set(0, output.x() / 2U);
}
// Subsample height for the tensor shape when channel is U or V
- if(has_format_vertical_subsampling(format))
+ if (has_format_vertical_subsampling(format))
{
output.set(1, output.y() / 2U);
}
@@ -418,25 +433,32 @@ inline TensorShape calculate_subsampled_shape(const TensorShape &shape, Format f
* @return Status
*/
template <typename... Ts>
-arm_compute::Status error_on_tensors_not_subsampled(const char *function, const char *file, int line,
- const Format &format, const TensorShape &shape, const ITensor *tensor1, Ts... tensors)
+arm_compute::Status error_on_tensors_not_subsampled(const char *function,
+ const char *file,
+ int line,
+ const Format &format,
+ const TensorShape &shape,
+ const ITensor *tensor1,
+ Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- const TensorShape sub2_shape = calculate_subsampled_shape(shape, format);
- const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor)
- {
- return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2);
- }),
- function, file, line, "Tensor shape has mismatch dimensions for sub-sampling");
+ const TensorShape sub2_shape = calculate_subsampled_shape(shape, format);
+ const std::array<const ITensor *, 1 + sizeof...(Ts)> tensors_info_array{{tensor1, std::forward<Ts>(tensors)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(),
+ [&](const ITensor *tensor)
+ { return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2); }),
+ function, file, line, "Tensor shape has mismatch dimensions for sub-sampling");
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return an error if the passed two tensor infos have different shapes from the given dimension
*
@@ -450,10 +472,15 @@ arm_compute::Status error_on_tensors_not_subsampled(const char *function, const
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
- return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)...);
+ return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2,
+ std::forward<Ts>(tensor_infos)...);
}
/** Return an error if the passed two tensors have different shapes from the given dimension
*
@@ -467,8 +494,12 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ const ITensor *tensor_1,
+ const ITensor *tensor_2,
+ Ts... tensors)
{
return error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
}
@@ -485,19 +516,28 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- unsigned int upper_dim, const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ unsigned int upper_dim,
+ const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_2 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
- const std::array < const ITensorInfo *, 2 + sizeof...(Ts) > tensors_info_array{ { tensor_info_1, tensor_info_2, tensor_infos... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), [&](const ITensorInfo * tensor_info)
- {
- return detail::have_different_dimensions((*tensors_info_array.cbegin())->tensor_shape(), tensor_info->tensor_shape(), upper_dim);
- }),
- function, file, line, "Tensors have different shapes");
+ const std::array<const ITensorInfo *, 2 + sizeof...(Ts)> tensors_info_array{
+ {tensor_info_1, tensor_info_2, tensor_infos...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(),
+ [&](const ITensorInfo *tensor_info)
+ {
+ return detail::have_different_dimensions(
+ (*tensors_info_array.cbegin())->tensor_shape(),
+ tensor_info->tensor_shape(), upper_dim);
+ }),
+ function, file, line, "Tensors have different shapes");
return arm_compute::Status{};
}
/** Return an error if the passed two tensors have different shapes from the given dimension
@@ -513,14 +553,20 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ unsigned int upper_dim,
+ const ITensor *tensor_1,
+ const ITensor *tensor_2,
+ Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_2 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...));
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) \
@@ -539,19 +585,18 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_data_layouts(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
- DataLayout &&tensor_data_layout = tensor_info->data_layout();
- const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { tensor_infos... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
- {
- return tensor_info_obj->data_layout() != tensor_data_layout;
- }),
- function, file, line, "Tensors have different data layouts");
+ DataLayout &&tensor_data_layout = tensor_info->data_layout();
+ const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{{tensor_infos...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(),
+ [&](const ITensorInfo *tensor_info_obj)
+ { return tensor_info_obj->data_layout() != tensor_data_layout; }),
+ function, file, line, "Tensors have different data layouts");
return arm_compute::Status{};
}
/** Return an error if the passed tensors have different data layouts
@@ -565,19 +610,21 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line,
- const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_data_layouts(
+ const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(function, file, line, tensor->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(
+ function, file, line, tensor->info(), detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return an error if the passed two tensor infos have different data types
*
@@ -590,19 +637,18 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_data_types(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
- DataType &&tensor_data_type = tensor_info->data_type();
- const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { tensor_infos... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
- {
- return tensor_info_obj->data_type() != tensor_data_type;
- }),
- function, file, line, "Tensors have different data types");
+ DataType &&tensor_data_type = tensor_info->data_type();
+ const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{{tensor_infos...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(),
+ [&](const ITensorInfo *tensor_info_obj)
+ { return tensor_info_obj->data_type() != tensor_data_type; }),
+ function, file, line, "Tensors have different data types");
return arm_compute::Status{};
}
/** Return an error if the passed two tensors have different data types
@@ -616,19 +662,21 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
- const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_data_types(
+ const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...));
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(function, file, line, tensor->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(
+ function, file, line, tensor->info(), detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info
*
@@ -644,28 +692,32 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_quantization_info(const char *function,
+ const char *file,
+ const int line,
+ const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
DataType &&first_data_type = tensor_info_1->data_type();
const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
- if(!is_data_type_quantized(first_data_type))
+ if (!is_data_type_quantized(first_data_type))
{
return arm_compute::Status{};
}
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->data_type() != first_data_type;
- }),
- function, file, line, "Tensors have different asymmetric quantized data types");
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->quantization_info() != first_quantization_info;
- }),
- function, file, line, "Tensors have different quantization information");
+ const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+ {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->data_type() != first_data_type; }),
+ function, file, line, "Tensors have different asymmetric quantized data types");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->quantization_info() != first_quantization_info; }),
+ function, file, line, "Tensors have different quantization information");
return arm_compute::Status{};
}
@@ -683,17 +735,24 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_quantization_info(const char *function,
+ const char *file,
+ const int line,
+ const ITensor *tensor_1,
+ const ITensor *tensor_2,
+ Ts... tensors)
{
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
*
@@ -705,8 +764,8 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
* @param[in] formats (Optional) Further allowed formats.
*/
template <typename T, typename F, typename... Fs>
-void error_on_format_not_in(const char *function, const char *file, const int line,
- const T *object, F &&format, Fs &&... formats)
+void error_on_format_not_in(
+ const char *function, const char *file, const int line, const T *object, F &&format, Fs &&...formats)
{
ARM_COMPUTE_ERROR_ON_LOC(object == nullptr, function, file, line);
@@ -715,17 +774,17 @@ void error_on_format_not_in(const char *function, const char *file, const int li
ARM_COMPUTE_ERROR_ON_LOC(object_format == Format::UNKNOWN, function, file, line);
- const std::array<F, sizeof...(Fs)> formats_array{ { std::forward<Fs>(formats)... } };
+ const std::array<F, sizeof...(Fs)> formats_array{{std::forward<Fs>(formats)...}};
ARM_COMPUTE_UNUSED(formats_array);
- ARM_COMPUTE_ERROR_ON_LOC_MSG(object_format != format && std::none_of(formats_array.begin(), formats_array.end(), [&](const F & f)
- {
- return f == object_format;
- }),
- function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(
+ object_format != format &&
+ std::none_of(formats_array.begin(), formats_array.end(), [&](const F &f) { return f == object_format; }),
+ function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
ARM_COMPUTE_UNUSED(function, format, file, line);
}
-#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) \
+ ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
/** Return an error if the data type of the passed tensor info does not match any of the data types provided.
*
@@ -739,20 +798,19 @@ void error_on_format_not_in(const char *function, const char *file, const int li
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dt, Ts &&...dts)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
const DataType &tensor_dt = tensor_info->data_type(); //NOLINT
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
- const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
- {
- return d == tensor_dt;
- }),
- function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str());
+ const std::array<T, sizeof...(Ts)> dts_array{{std::forward<Ts>(dts)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T &d) { return d == tensor_dt; }),
+ function, file, line, "ITensor data type %s not supported by this kernel",
+ string_from_data_type(tensor_dt).c_str());
return arm_compute::Status{};
}
/** Return an error if the data type of the passed tensor does not match any of the data types provided.
@@ -767,11 +825,12 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(
+ const char *function, const char *file, const int line, const ITensor *tensor, T &&dt, Ts &&...dts)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(
+ function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \
@@ -791,20 +850,19 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, T &&dl, Ts &&... dls)
+inline arm_compute::Status error_on_data_layout_not_in(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dl, Ts &&...dls)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
const DataLayout &tensor_dl = tensor_info->data_layout(); //NOLINT
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dl == DataLayout::UNKNOWN, function, file, line);
- const std::array<T, sizeof...(Ts)> dls_array{ { std::forward<Ts>(dls)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T & l)
- {
- return l == tensor_dl;
- }),
- function, file, line, "ITensor data layout %s not supported by this kernel", string_from_data_layout(tensor_dl).c_str());
+ const std::array<T, sizeof...(Ts)> dls_array{{std::forward<Ts>(dls)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T &l) { return l == tensor_dl; }),
+ function, file, line, "ITensor data layout %s not supported by this kernel",
+ string_from_data_layout(tensor_dl).c_str());
return arm_compute::Status{};
}
/** Return an error if the data layout of the passed tensor does not match any of the data layout provided.
@@ -819,17 +877,19 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, T &&dl, Ts &&... dls)
+inline arm_compute::Status error_on_data_layout_not_in(
+ const char *function, const char *file, const int line, const ITensor *tensor, T &&dl, Ts &&...dls)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(function, file, line, tensor->info(), std::forward<T>(dl), std::forward<Ts>(dls)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(
+ function, file, line, tensor->info(), std::forward<T>(dl), std::forward<Ts>(dls)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
/** Return an error if the data type or the number of channels of the passed tensor info does not match any of the data types and number of channels provided.
*
@@ -844,12 +904,20 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_channel_not_in(const char *function,
+ const char *file,
+ const int line,
+ const ITensorInfo *tensor_info,
+ size_t num_channels,
+ T &&dt,
+ Ts &&...dts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(
+ function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
const size_t tensor_nc = tensor_info->num_channels();
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line, "Number of channels %zu. Required number of channels %zu", tensor_nc, num_channels);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line,
+ "Number of channels %zu. Required number of channels %zu", tensor_nc,
+ num_channels);
return arm_compute::Status{};
}
/** Return an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
@@ -865,17 +933,25 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_channel_not_in(const char *function,
+ const char *file,
+ const int line,
+ const ITensor *tensor,
+ size_t num_channels,
+ T &&dt,
+ Ts &&...dts)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, std::forward<T>(dt), std::forward<Ts>(dts)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels,
+ std::forward<T>(dt), std::forward<Ts>(dts)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
/** Return an error if the data type of the passed tensor info is FP16 and FP16 extension is not supported by the device.
*
@@ -887,12 +963,12 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, bool is_fp16_supported)
+inline arm_compute::Status error_on_unsupported_fp16(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, bool is_fp16_supported)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported),
- function, file, line, "FP16 not supported by the device");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported), function,
+ file, line, "FP16 not supported by the device");
return arm_compute::Status{};
}
@@ -906,11 +982,12 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line,
- const ITensor *tensor, bool is_fp16_supported)
+inline arm_compute::Status error_on_unsupported_fp16(
+ const char *function, const char *file, const int line, const ITensor *tensor, bool is_fp16_supported)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported));
return arm_compute::Status{};
}
@@ -923,8 +1000,8 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const
*
* @return Status
*/
-arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
- const ITensor *tensor);
+arm_compute::Status
+error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensor *tensor);
/** Return an error if the tensor info is not 2D.
*
@@ -935,8 +1012,8 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil
*
* @return Status
*/
-arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
- const ITensorInfo *tensor);
+arm_compute::Status
+error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensorInfo *tensor);
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t))
@@ -955,17 +1032,15 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_channel_not_in(const char *function, const char *file, const int line,
- T cn, T &&channel, Ts &&... channels)
+inline arm_compute::Status
+error_on_channel_not_in(const char *function, const char *file, const int line, T cn, T &&channel, Ts &&...channels)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
- const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
- {
- return f == cn;
- }),
- function, file, line);
+ const std::array<T, sizeof...(Ts)> channels_array{{std::forward<Ts>(channels)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(),
+ [&](const T &f) { return f == cn; }),
+ function, file, line);
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) \
@@ -983,8 +1058,8 @@ inline arm_compute::Status error_on_channel_not_in(const char *function, const c
*
* @return Status
*/
-arm_compute::Status error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
- Format fmt, Channel cn);
+arm_compute::Status
+error_on_channel_not_in_known_format(const char *function, const char *file, const int line, Format fmt, Channel cn);
#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
#define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
@@ -999,8 +1074,8 @@ arm_compute::Status error_on_channel_not_in_known_format(const char *function, c
*
* @return Status
*/
-arm_compute::Status error_on_unconfigured_kernel(const char *function, const char *file, const int line,
- const IKernel *kernel);
+arm_compute::Status
+error_on_unconfigured_kernel(const char *function, const char *file, const int line, const IKernel *kernel);
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k))
#define ARM_COMPUTE_RETURN_ERROR_ON_UNCONFIGURED_KERNEL(k) \
@@ -1017,8 +1092,12 @@ arm_compute::Status error_on_unconfigured_kernel(const char *function, const cha
*
* @return Status
*/
-arm_compute::Status error_on_invalid_subtensor(const char *function, const char *file, const int line,
- const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+arm_compute::Status error_on_invalid_subtensor(const char *function,
+ const char *file,
+ const int line,
+ const TensorShape &parent_shape,
+ const Coordinates &coords,
+ const TensorShape &shape);
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
@@ -1034,11 +1113,16 @@ arm_compute::Status error_on_invalid_subtensor(const char *function, const char
*
* @return Status
*/
-arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
- const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function,
+ const char *file,
+ const int line,
+ const ValidRegion &parent_valid_region,
+ const ValidRegion &valid_region);
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
-}
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+} // namespace arm_compute
#endif /* ARM_COMPUTE_VALIDATE_H*/
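The validation helpers and macros reformatted above are normally consumed from a kernel's or operator's static validate() routine; the following is a minimal sketch of that pattern (the function name and tensor arguments are illustrative, not part of this patch):

    // Illustrative only: combining the Validate.h macros in a validate() helper.
    #include "arm_compute/core/Validate.h"

    using namespace arm_compute;

    Status validate_binary_op(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(src0, DataType::F16, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1, dst);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, src1, dst);
        return Status{};
    }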
diff --git a/arm_compute/core/Version.h b/arm_compute/core/Version.h
index a4d307950a..44d400bad8 100644
--- a/arm_compute/core/Version.h
+++ b/arm_compute/core/Version.h
@@ -28,7 +28,7 @@
/* Macro utilities */
#define ARM_COMPUTE_STRINGIFY2(s) #s
-#define ARM_COMPUTE_STRINGIFY(s) ARM_COMPUTE_STRINGIFY2(s)
+#define ARM_COMPUTE_STRINGIFY(s) ARM_COMPUTE_STRINGIFY2(s)
#define ARM_COMPUTE_VERSION_STR \
ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index 8ae859f4b3..4863b95045 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -24,15 +24,15 @@
#ifndef ARM_COMPUTE_WINDOW_H
#define ARM_COMPUTE_WINDOW_H
-#include <algorithm>
-#include <array>
-#include <cstddef>
-
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/utils/math/Math.h"
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
namespace arm_compute
{
/** Describe a multidimensional execution window. */
@@ -86,8 +86,7 @@ public:
* @param[in] step Step between two elements of the dimension when iterating.
*
*/
- constexpr Dimension(int start = 0, int end = 1, int step = 1)
- : _start(start), _end(end), _step(step)
+ constexpr Dimension(int start = 0, int end = 1, int step = 1) : _start(start), _end(end), _step(step)
{
}
Dimension(const Dimension &d) = default;
@@ -373,7 +372,8 @@ public:
*
* @return Collapsed window.
*/
- Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const;
+ Window
+ collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const;
/** Collapse the dimensions higher than @p first if possible.
*
@@ -441,7 +441,7 @@ private:
* @return The first slice of the window.
*/
template <unsigned int window_dimension>
- Window first_slice_window() const;
+ Window first_slice_window() const;
/** Slide the passed window slice.
*
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 5ee4b57145..d935507b1d 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -26,7 +26,7 @@ namespace arm_compute
inline Window::Window(const Window &src)
: _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value)
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
set(i, src[i]);
_is_broadcasted[i] = src.is_broadcasted(i);
@@ -65,32 +65,34 @@ inline bool Window::is_broadcasted(size_t dimension) const
return _is_broadcasted[dimension];
}
-inline Window Window::collapse_if_possible(const Window &full_window, const size_t first,
- const size_t last, bool *has_collapsed) const
+inline Window Window::collapse_if_possible(const Window &full_window,
+ const size_t first,
+ const size_t last,
+ bool *has_collapsed) const
{
Window collapsed(*this);
bool is_collapsable = true;
int collapsed_end = _dims[first].end();
- for(size_t d = first + 1; is_collapsable && (d < last); ++d)
+ for (size_t d = first + 1; is_collapsable && (d < last); ++d)
{
// The _dims's dimension must match the full _dims dimension to be collapsable:
- is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1)
- && (full_window[d].end() == _dims[d].end());
+ is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1) &&
+ (full_window[d].end() == _dims[d].end());
collapsed_end *= _dims[d].end();
}
- if(is_collapsable)
+ if (is_collapsable)
{
collapsed._dims.at(first).set_end(collapsed_end);
- for(size_t d = first + 1; is_collapsable && (d < last); ++d)
+ for (size_t d = first + 1; is_collapsable && (d < last); ++d)
{
collapsed.set(d, Dimension());
}
}
- if(has_collapsed != nullptr)
+ if (has_collapsed != nullptr)
{
*has_collapsed = is_collapsable;
}
@@ -101,7 +103,7 @@ inline Window Window::collapse_if_possible(const Window &full_window, const size
inline Window Window::shift_dimensions(unsigned int shift_value) const
{
Window shifted_window;
- for(size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++)
+ for (size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++)
{
shifted_window.set(n, _dims[n + shift_value]);
}
@@ -120,9 +122,9 @@ inline Window Window::collapse(const Window &full_window, const size_t first, co
inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) const
{
Window broadcastWin(*this);
- for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
{
- if(shape[d] <= 1)
+ if (shape[d] <= 1)
{
broadcastWin.set_broadcasted(d);
}
@@ -142,7 +144,7 @@ inline void Window::adjust(size_t dimension, int adjust_value, bool is_at_start)
ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
Window::Dimension &d = _dims[dimension];
- if(is_at_start)
+ if (is_at_start)
{
d = Window::Dimension(d.start() + adjust_value, d.end(), d.step());
}
@@ -172,7 +174,7 @@ inline void Window::set_dimension_step(size_t dimension, int step)
inline void Window::validate() const
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(_dims[i].end() < _dims[i].start());
ARM_COMPUTE_ERROR_ON((_dims[i].step() != 0) && (((_dims[i].end() - _dims[i].start()) % _dims[i].step()) != 0));
@@ -193,9 +195,9 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
Window out;
- for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+ for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
{
- if(d == dimension)
+ if (d == dimension)
{
int start = _dims[d].start();
int end = _dims[d].end();
@@ -207,7 +209,7 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
int it_start = work * id;
- if(int(id) < rem)
+ if (int(id) < rem)
{
++work;
it_start += id;
@@ -234,18 +236,18 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
template <unsigned int window_dimension>
inline bool Window::slide_window_slice(Window &slice) const
{
- for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+ for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
{
// Did we reach the end of this dimension?
const int v = slice._dims[n].start() + 1;
- if(v < _dims[n].end())
+ if (v < _dims[n].end())
{
// No: increment
slice._dims[n] = Dimension(v, v + 1, 1);
// Reset lower dimensions:
- for(unsigned int lower = window_dimension; lower < n; ++lower)
+ for (unsigned int lower = window_dimension; lower < n; ++lower)
{
slice._dims[lower] = Dimension(_dims[lower].start(), _dims[lower].start() + 1, 1);
}
@@ -258,14 +260,14 @@ inline bool Window::slide_window_slice(Window &slice) const
}
template <unsigned int window_dimension>
-inline Window Window::first_slice_window() const
+inline Window Window::first_slice_window() const
{
Window slice;
std::copy_n(_dims.begin(), window_dimension, slice._dims.begin());
//Initialise higher dimensions to be the first slice.
- for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+ for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
{
slice._dims[n] = Dimension(_dims[n].start(), _dims[n].start() + 1, 1);
}
@@ -275,7 +277,7 @@ inline Window Window::first_slice_window() const
inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first_dimension)
{
- for(unsigned int n = first_dimension; n < shape.num_dimensions(); ++n)
+ for (unsigned int n = first_dimension; n < shape.num_dimensions(); ++n)
{
set(n, Window::Dimension(0, std::max(shape[n], static_cast<size_t>(1))));
}
@@ -284,7 +286,7 @@ inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first
inline TensorShape Window::shape() const
{
TensorShape shape;
- for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
{
shape.set(d, (_dims[d].end() - _dims[d].start()) / _dims[d].step());
}
@@ -294,7 +296,7 @@ inline TensorShape Window::shape() const
inline size_t Window::num_iterations_total() const
{
size_t total = 1;
- for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+ for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
{
total *= num_iterations(d);
}
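For context, the slicing and splitting members re-indented above are typically used to walk a window plane by plane or to divide it across workers; a minimal sketch, assuming the public *_2D wrappers around the private templates shown here:

    // Illustrative sketch of the Window slicing/splitting pattern.
    #include "arm_compute/core/Window.h"

    using namespace arm_compute;

    void run_plane_by_plane(const Window &win)
    {
        // Walk the window one 2D plane at a time (dimensions 0 and 1 stay whole).
        Window slice = win.first_slice_window_2D();
        do
        {
            // ... enqueue or execute the kernel on 'slice' ...
        } while (win.slide_window_slice_2D(slice));
    }

    Window worker_window(const Window &win, size_t id, size_t total)
    {
        // Give worker 'id' of 'total' its share of the Y dimension (assumed split axis).
        return win.split_window(Window::DimY, id, total);
    }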
diff --git a/arm_compute/core/WindowIterator.h b/arm_compute/core/WindowIterator.h
index b1e399c872..29302c410a 100644
--- a/arm_compute/core/WindowIterator.h
+++ b/arm_compute/core/WindowIterator.h
@@ -28,7 +28,6 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"
-
namespace arm_compute
{
/** Convert an offset in window steps into absolute coordinates.
@@ -41,7 +40,7 @@ namespace arm_compute
inline Coordinates convert_window_coord_to_position(const Window &w, const Coordinates &offset)
{
Coordinates position;
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
position.set(i, w[i].start() + offset[i] * w[i].step());
}
@@ -165,7 +164,7 @@ public:
template <typename M>
void iterate_3D(M &&on_new_row_size)
{
- while(_end.z() != _position.z())
+ while (_end.z() != _position.z())
{
iterate_2D_internal(on_new_row_size, _w.x().end() - _w.x().step(), _w.y().end() - _w.y().step());
_position[2] += _w.z().step();
@@ -212,7 +211,7 @@ private:
void iterate_2D_internal(M &&on_new_row_size, int end_x, int end_y)
{
//Is there more than one row to process ?
- if(end_y == _position.y())
+ if (end_y == _position.y())
{
// Both start and end belong to the same row:
iterate_over_dim0(end_x + _w.x().step(), on_new_row_size);
@@ -220,7 +219,7 @@ private:
else
{
// Do we start from the beginning of the row ?
- if(_w.x().start() != _position.x())
+ if (_w.x().start() != _position.x())
{
//Start in the middle of a row: process left-over X
iterate_over_dim0(_w.x().end(), on_new_row_size);
@@ -229,7 +228,7 @@ private:
//Middle rows
bool no_leftover = end_x + _w.x().step() == _w.x().end();
- if(no_leftover)
+ if (no_leftover)
{
//Switch to full row size:
on_new_row_size(_w[0].start(), _w.x().end());
@@ -241,7 +240,7 @@ private:
else
{
// Are there full rows to process ?
- if(_position[1] != end_y)
+ if (_position[1] != end_y)
{
//Switch to full row size:
on_new_row_size(_w[0].start(), _w.x().end());
@@ -261,7 +260,7 @@ private:
*/
void iterate_over_dim1(int end)
{
- for(; _position[1] != end; _position[1] += _w[1].step())
+ for (; _position[1] != end; _position[1] += _w[1].step())
{
_position[0] = _w[0].start();
iterate_over_dim0(_w[0].end());
@@ -288,7 +287,7 @@ private:
{
// Both start and end belong to the same row:
ARM_COMPUTE_ERROR_ON(_position[0] > end);
- for(; _position.x() < end; _position[0] += _w[0].step())
+ for (; _position.x() < end; _position[0] += _w[0].step())
{
_lambda_function(_position);
}
@@ -310,9 +309,10 @@ private:
* @return A WindowIterator object.
*/
template <typename L>
-WindowIterator<L> create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function)
+WindowIterator<L>
+create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function)
{
return WindowIterator<L>(w, start, end, std::move(lambda_function));
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_WINDOW_ITERATOR_H*/
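create_window_iterator, whose declaration was re-wrapped above, binds a per-position lambda to a window region; a sketch follows, with the callback signatures inferred from the calls visible in this hunk (treat the exact meaning of the start/end coordinates as an assumption):

    // Illustrative sketch only; callback shapes inferred from the code above.
    #include "arm_compute/core/WindowIterator.h"

    using namespace arm_compute;

    void visit(const Window &win, const Coordinates &start, const Coordinates &end)
    {
        auto it = create_window_iterator(win, start, end,
                                         [](const Coordinates &pos)
                                         {
                                             // Invoked once per element position between start and end.
                                             (void)pos;
                                         });
        // The row callback receives the X extent of the rows about to be processed.
        it.iterate_3D([](int start_x, int end_x) { (void)start_x; (void)end_x; });
    }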
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index 8dd6812b58..63a3a1a1ec 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -92,24 +92,18 @@ struct MemoryInfo
{
MemoryInfo() = default;
- MemoryInfo(int slot, size_t size, size_t alignment = 0) noexcept
- : slot(slot),
- size(size),
- alignment(alignment)
+ MemoryInfo(int slot, size_t size, size_t alignment = 0) noexcept : slot(slot), size(size), alignment(alignment)
{
}
MemoryInfo(int slot, MemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept
- : slot(slot),
- lifetime(lifetime),
- size(size),
- alignment(alignment)
+ : slot(slot), lifetime(lifetime), size(size), alignment(alignment)
{
}
bool merge(int slot, size_t new_size, size_t new_alignment = 0) noexcept
{
- if(slot != this->slot)
+ if (slot != this->slot)
{
return false;
}
@@ -120,10 +114,10 @@ struct MemoryInfo
return true;
}
- int slot{ ACL_UNKNOWN };
- MemoryLifetime lifetime{ MemoryLifetime::Temporary };
- size_t size{ 0 };
- size_t alignment{ 64 };
+ int slot{ACL_UNKNOWN};
+ MemoryLifetime lifetime{MemoryLifetime::Temporary};
+ size_t size{0};
+ size_t alignment{64};
};
using MemoryRequirements = std::vector<MemoryInfo>;
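The brace-init changes to MemoryInfo above are purely cosmetic; declaring auxiliary workspace requirements still looks roughly like this sketch (slot ids, sizes and alignments are made-up values):

    // Illustrative: building a MemoryRequirements list with the reformatted MemoryInfo.
    #include "arm_compute/core/experimental/Types.h"

    using namespace arm_compute::experimental;

    MemoryRequirements example_workspace()
    {
        MemoryRequirements reqs;
        reqs.emplace_back(/* slot */ 0, /* size */ size_t{1024}, /* alignment */ size_t{64});
        reqs.emplace_back(/* slot */ 1, MemoryLifetime::Temporary, /* size */ size_t{4096});
        return reqs;
    }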
diff --git a/arm_compute/core/utils/ActivationFunctionUtils.h b/arm_compute/core/utils/ActivationFunctionUtils.h
index 1cb66da13d..c988efa256 100644
--- a/arm_compute/core/utils/ActivationFunctionUtils.h
+++ b/arm_compute/core/utils/ActivationFunctionUtils.h
@@ -37,5 +37,5 @@ namespace arm_compute
* @return The string describing the activation function.
*/
const std::string &string_from_activation_func(const ActivationFunction &act);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H */
diff --git a/arm_compute/core/utils/DataLayoutUtils.h b/arm_compute/core/utils/DataLayoutUtils.h
index 399f55c63f..61839c9f91 100644
--- a/arm_compute/core/utils/DataLayoutUtils.h
+++ b/arm_compute/core/utils/DataLayoutUtils.h
@@ -36,5 +36,5 @@ namespace arm_compute
* @return The string describing the data layout.
*/
const std::string &string_from_data_layout(DataLayout dl);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H */
diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h
index cbb409c8a1..7ea5eb7670 100644
--- a/arm_compute/core/utils/DataTypeUtils.h
+++ b/arm_compute/core/utils/DataTypeUtils.h
@@ -37,7 +37,7 @@ namespace arm_compute
*/
inline size_t data_size_from_type(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::U8:
case DataType::S8:
@@ -77,7 +77,7 @@ inline size_t data_size_from_type(DataType data_type)
*/
inline size_t element_size_from_data_type(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::S8:
case DataType::U8:
@@ -114,7 +114,7 @@ inline size_t element_size_from_data_type(DataType dt)
*/
inline DataType data_type_from_format(Format format)
{
- switch(format)
+ switch (format)
{
case Format::U8:
case Format::UV88:
@@ -158,7 +158,7 @@ inline DataType data_type_from_format(Format format)
*/
inline DataType get_promoted_data_type(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
return DataType::U16;
@@ -196,7 +196,7 @@ inline std::tuple<PixelValue, PixelValue> get_min_max(DataType dt)
{
PixelValue min{};
PixelValue max{};
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -303,7 +303,7 @@ inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type)
*/
inline bool is_data_type_float(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::F16:
case DataType::F32:
@@ -323,7 +323,7 @@ inline bool is_data_type_float(DataType dt)
*/
inline bool is_data_type_quantized(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::QSYMM8:
case DataType::QASYMM8:
@@ -345,7 +345,7 @@ inline bool is_data_type_quantized(DataType dt)
*/
inline bool is_data_type_quantized_asymmetric(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -364,7 +364,7 @@ inline bool is_data_type_quantized_asymmetric(DataType dt)
*/
inline bool is_data_type_quantized_asymmetric_signed(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::QASYMM8_SIGNED:
return true;
@@ -381,7 +381,7 @@ inline bool is_data_type_quantized_asymmetric_signed(DataType dt)
*/
inline bool is_data_type_quantized_symmetric(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::QSYMM8:
case DataType::QSYMM8_PER_CHANNEL:
@@ -400,7 +400,7 @@ inline bool is_data_type_quantized_symmetric(DataType dt)
*/
inline bool is_data_type_quantized_per_channel(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::QSYMM8_PER_CHANNEL:
return true;
@@ -420,12 +420,13 @@ inline bool is_data_type_quantized_per_channel(DataType dt)
template <typename T>
bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = QuantizationInfo())
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
{
const auto val_u8 = static_cast<uint8_t>(val);
- return ((val_u8 == val) && val >= std::numeric_limits<uint8_t>::lowest() && val <= std::numeric_limits<uint8_t>::max());
+ return ((val_u8 == val) && val >= std::numeric_limits<uint8_t>::lowest() &&
+ val <= std::numeric_limits<uint8_t>::max());
}
case DataType::QASYMM8:
{
@@ -436,29 +437,34 @@ bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = Quantization
case DataType::S8:
{
const auto val_s8 = static_cast<int8_t>(val);
- return ((val_s8 == val) && val >= std::numeric_limits<int8_t>::lowest() && val <= std::numeric_limits<int8_t>::max());
+ return ((val_s8 == val) && val >= std::numeric_limits<int8_t>::lowest() &&
+ val <= std::numeric_limits<int8_t>::max());
}
case DataType::U16:
{
const auto val_u16 = static_cast<uint16_t>(val);
- return ((val_u16 == val) && val >= std::numeric_limits<uint16_t>::lowest() && val <= std::numeric_limits<uint16_t>::max());
+ return ((val_u16 == val) && val >= std::numeric_limits<uint16_t>::lowest() &&
+ val <= std::numeric_limits<uint16_t>::max());
}
case DataType::S16:
{
const auto val_s16 = static_cast<int16_t>(val);
- return ((val_s16 == val) && val >= std::numeric_limits<int16_t>::lowest() && val <= std::numeric_limits<int16_t>::max());
+ return ((val_s16 == val) && val >= std::numeric_limits<int16_t>::lowest() &&
+ val <= std::numeric_limits<int16_t>::max());
}
case DataType::U32:
{
const auto val_d64 = static_cast<double>(val);
const auto val_u32 = static_cast<uint32_t>(val);
- return ((val_u32 == val_d64) && val_d64 >= std::numeric_limits<uint32_t>::lowest() && val_d64 <= std::numeric_limits<uint32_t>::max());
+ return ((val_u32 == val_d64) && val_d64 >= std::numeric_limits<uint32_t>::lowest() &&
+ val_d64 <= std::numeric_limits<uint32_t>::max());
}
case DataType::S32:
{
const auto val_d64 = static_cast<double>(val);
const auto val_s32 = static_cast<int32_t>(val);
- return ((val_s32 == val_d64) && val_d64 >= std::numeric_limits<int32_t>::lowest() && val_d64 <= std::numeric_limits<int32_t>::max());
+ return ((val_s32 == val_d64) && val_d64 >= std::numeric_limits<int32_t>::lowest() &&
+ val_d64 <= std::numeric_limits<int32_t>::max());
}
case DataType::BFLOAT16:
return (val >= bfloat16::lowest() && val <= bfloat16::max());
@@ -482,7 +488,7 @@ inline std::string cpu_impl_dt(const DataType &data_type)
{
std::string ret = "";
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
ret = "fp32";
@@ -521,5 +527,5 @@ inline std::string cpu_impl_dt(const DataType &data_type)
return ret;
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H */
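The helpers re-wrapped above answer simple questions about a data type; a short sketch of two of them (the wrapper functions below are illustrative, not library API):

    // Illustrative use of data_size_from_type() and check_value_range().
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/utils/DataTypeUtils.h"

    using namespace arm_compute;

    size_t element_bytes(const ITensorInfo &info)
    {
        // e.g. 1 for U8/QASYMM8, 2 for F16, 4 for F32/S32.
        return data_size_from_type(info.data_type());
    }

    bool scalar_fits(float value, const ITensorInfo &info)
    {
        // True if 'value' is representable in the tensor's data type;
        // the quantization info matters for the quantized types.
        return check_value_range(value, info.data_type(), info.quantization_info());
    }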
diff --git a/arm_compute/core/utils/FormatUtils.h b/arm_compute/core/utils/FormatUtils.h
index afb0f78255..a8e96bd361 100644
--- a/arm_compute/core/utils/FormatUtils.h
+++ b/arm_compute/core/utils/FormatUtils.h
@@ -37,7 +37,7 @@ namespace arm_compute
*/
inline size_t pixel_size_from_format(Format format)
{
- switch(format)
+ switch (format)
{
case Format::U8:
return 1;
@@ -77,7 +77,7 @@ inline size_t pixel_size_from_format(Format format)
*/
inline int plane_idx_from_channel(Format format, Channel channel)
{
- switch(format)
+ switch (format)
{
// Single planar formats have a single plane
case Format::U8:
@@ -99,7 +99,7 @@ inline int plane_idx_from_channel(Format format, Channel channel)
case Format::NV21:
{
// Channel U and V share the same plane of format UV88
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 0;
@@ -114,7 +114,7 @@ inline int plane_idx_from_channel(Format format, Channel channel)
case Format::IYUV:
case Format::YUV444:
{
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 0;
@@ -142,11 +142,11 @@ inline int plane_idx_from_channel(Format format, Channel channel)
*/
inline int channel_idx_from_format(Format format, Channel channel)
{
- switch(format)
+ switch (format)
{
case Format::RGB888:
{
- switch(channel)
+ switch (channel)
{
case Channel::R:
return 0;
@@ -161,7 +161,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
}
case Format::RGBA8888:
{
- switch(channel)
+ switch (channel)
{
case Channel::R:
return 0;
@@ -178,7 +178,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
}
case Format::YUYV422:
{
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 0;
@@ -193,7 +193,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
}
case Format::UYVY422:
{
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 1;
@@ -208,7 +208,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
}
case Format::NV12:
{
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 0;
@@ -223,7 +223,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
}
case Format::NV21:
{
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 0;
@@ -239,7 +239,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
case Format::YUV444:
case Format::IYUV:
{
- switch(channel)
+ switch (channel)
{
case Channel::Y:
return 0;
@@ -266,7 +266,7 @@ inline int channel_idx_from_format(Format format, Channel channel)
*/
inline size_t num_planes_from_format(Format format)
{
- switch(format)
+ switch (format)
{
case Format::U8:
case Format::S16:
@@ -301,7 +301,7 @@ inline size_t num_planes_from_format(Format format)
*/
inline size_t num_channels_from_format(Format format)
{
- switch(format)
+ switch (format)
{
case Format::U8:
case Format::U16:
@@ -340,5 +340,5 @@ inline size_t num_channels_from_format(Format format)
* @return The string describing the format.
*/
const std::string &string_from_format(Format format);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H */
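The switch tables re-indented above encode how channels map onto planes for the packed and planar YUV formats; a small sketch for NV12 (values follow the cases shown here):

    // Illustrative: plane/channel queries for NV12, matching the tables above.
    #include "arm_compute/core/utils/FormatUtils.h"

    using namespace arm_compute;

    void describe_nv12()
    {
        const size_t planes  = num_planes_from_format(Format::NV12);              // 2: Y plane + interleaved UV plane
        const int    u_plane = plane_idx_from_channel(Format::NV12, Channel::U);  // 1: U and V share the UV88 plane
        const int    u_idx   = channel_idx_from_format(Format::NV12, Channel::U); // 0: U comes first within UV88
        (void)planes;
        (void)u_plane;
        (void)u_idx;
    }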
diff --git a/arm_compute/core/utils/InterpolationPolicyUtils.h b/arm_compute/core/utils/InterpolationPolicyUtils.h
index 79f6e3aa5f..8d4ae4321c 100644
--- a/arm_compute/core/utils/InterpolationPolicyUtils.h
+++ b/arm_compute/core/utils/InterpolationPolicyUtils.h
@@ -37,5 +37,5 @@ namespace arm_compute
* @return The string describing the interpolation policy.
*/
const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H */
diff --git a/arm_compute/core/utils/StringUtils.h b/arm_compute/core/utils/StringUtils.h
index 41f29b0901..c13cbaa334 100644
--- a/arm_compute/core/utils/StringUtils.h
+++ b/arm_compute/core/utils/StringUtils.h
@@ -61,5 +61,5 @@ std::string float_to_string_with_full_precision(float val);
* @return std::string
*/
std::string join(const std::vector<std::string> strings, const std::string &sep);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H */
diff --git a/arm_compute/core/utils/helpers/AdjustVecSize.h b/arm_compute/core/utils/helpers/AdjustVecSize.h
index bbb3048b84..842e3b57d6 100644
--- a/arm_compute/core/utils/helpers/AdjustVecSize.h
+++ b/arm_compute/core/utils/helpers/AdjustVecSize.h
@@ -39,17 +39,17 @@ inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
{
ARM_COMPUTE_ERROR_ON(vec_size > 16);
- if((vec_size >= dim0) && (dim0 == 3))
+ if ((vec_size >= dim0) && (dim0 == 3))
{
return dim0;
}
- while(vec_size > dim0)
+ while (vec_size > dim0)
{
vec_size >>= 1;
}
return vec_size;
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_UTILS_H */
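adjust_vec_size() above shrinks a requested vector width by halving it until it no longer exceeds the innermost dimension, with a special case for dim0 == 3; a few worked values as a sketch:

    // Worked examples for adjust_vec_size(), per the logic shown above.
    #include "arm_compute/core/utils/helpers/AdjustVecSize.h"

    using namespace arm_compute;

    void adjust_vec_size_examples()
    {
        const unsigned int a = adjust_vec_size(16, 24); // 16 <= 24, so the requested width is kept: 16
        const unsigned int b = adjust_vec_size(16, 10); // halved until it fits: 16 -> 8
        const unsigned int c = adjust_vec_size(8, 3);   // dim0 == 3 special case: returns 3
        (void)a;
        (void)b;
        (void)c;
    }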
diff --git a/arm_compute/core/utils/helpers/tensor_transform.h b/arm_compute/core/utils/helpers/tensor_transform.h
index faa5b4433c..7a61fa192a 100644
--- a/arm_compute/core/utils/helpers/tensor_transform.h
+++ b/arm_compute/core/utils/helpers/tensor_transform.h
@@ -52,7 +52,8 @@ int calculate_stride_on_index(int index, Coordinates strides);
*
* @return Absolute start position of a given index
*/
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask);
+int calculate_start_on_index(
+ TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask);
/** Returns the absolute end position of a given index for a strided slice operation
*
@@ -68,8 +69,13 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
*
* @return Absolute end position of a given index
*/
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, Coordinates ends, Coordinates strides,
- int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+int calculate_end_on_index(TensorShape input_shape,
+ int index,
+ int start_on_index,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Calculate start, end and stride coordinates for a strided slice
*
@@ -87,8 +93,12 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
* @return A tuple with <Start,End,Strides>
*/
std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
- Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Computes output shape of strided slice
*
@@ -109,9 +119,14 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords
*
* @return The output tensor shape
*/
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0,
- bool return_unshrinked = false);
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0,
+ bool return_unshrinked = false);
/** Constructs end mask in case we want to perform a slice operation using the strided slice interface
*
@@ -122,7 +137,7 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina
* @return End mask
*/
int32_t construct_slice_end_mask(Coordinates ends);
-} // namespace tensor_tranform
+} // namespace tensor_transform
} // namespace helpers
} // namespace arm_compute
#endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_TRANSFORM_H */
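The begin_mask/end_mask parameters documented above switch individual start/end coordinates back to their defaults (0 and the dimension size). A simplified, single-axis sketch of how a masked strided-slice length could be computed, assuming positive strides and non-negative indices only (hypothetical helper, not the library's implementation):

    #include <algorithm>

    int sliced_length_sketch(int dim, int start, int end, int stride, bool begin_masked, bool end_masked)
    {
        if (begin_masked)
        {
            start = 0; // ignore the provided start for this axis
        }
        if (end_masked)
        {
            end = dim; // ignore the provided end for this axis
        }
        start = std::min(start, dim);
        end   = std::min(end, dim);
        const int span = std::max(end - start, 0);
        return (span + stride - 1) / stride; // ceiling division
    }
    // sliced_length_sketch(10, 2, 8, 2, false, false) == 3  (indices 2, 4, 6)
    // sliced_length_sketch(10, 2, 8, 2, false, true)  == 4  (indices 2, 4, 6, 8)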
diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h
index 0e5b84f084..a865aadddb 100644
--- a/arm_compute/core/utils/logging/FilePrinter.h
+++ b/arm_compute/core/utils/logging/FilePrinter.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_LOGGING_FILE_PRINTER_H
#define ARM_COMPUTE_LOGGING_FILE_PRINTER_H
-#include "arm_compute/core/utils/logging/IPrinter.h"
-
#include "arm_compute/core/utils/io/FileHandler.h"
+#include "arm_compute/core/utils/logging/IPrinter.h"
namespace arm_compute
{
diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h
index 5f8b948592..c3c2f0f0b8 100644
--- a/arm_compute/core/utils/logging/Helpers.h
+++ b/arm_compute/core/utils/logging/Helpers.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_LOGGING_HELPERS_H
#include "arm_compute/core/utils/logging/Types.h"
+
#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -45,7 +46,7 @@ namespace logging
* @return The formatted string
*/
template <typename... Ts>
-inline std::string string_with_format(const std::string &fmt, Ts &&... args)
+inline std::string string_with_format(const std::string &fmt, Ts &&...args)
{
size_t size = support::cpp11::snprintf(nullptr, 0, fmt.c_str(), args...) + 1;
auto char_str = std::make_unique<char[]>(size);
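string_with_format() above uses the two-pass snprintf idiom: a first call with a null buffer measures the formatted length, a second call writes into an exactly-sized buffer. A standalone sketch with plain std::snprintf instead of the library's support::cpp11 wrapper:

    #include <cstdio>
    #include <memory>
    #include <string>

    template <typename... Ts>
    std::string format_string_sketch(const std::string &fmt, Ts &&...args)
    {
        const int needed = std::snprintf(nullptr, 0, fmt.c_str(), args...); // measure only
        if (needed < 0)
        {
            return {}; // formatting error
        }
        const std::size_t size = static_cast<std::size_t>(needed) + 1; // room for the terminator
        auto buf = std::make_unique<char[]>(size);
        std::snprintf(buf.get(), size, fmt.c_str(), args...);
        return std::string(buf.get(), buf.get() + needed);
    }
    // format_string_sketch("%s = %d", "answer", 42) -> "answer = 42"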
diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h
index 42dca58ea1..7fde4d9302 100644
--- a/arm_compute/core/utils/logging/IPrinter.h
+++ b/arm_compute/core/utils/logging/IPrinter.h
@@ -35,8 +35,7 @@ class Printer
{
public:
/** Default Constructor */
- Printer() noexcept
- : _mtx()
+ Printer() noexcept : _mtx()
{
}
/** Prevent instances of this class from being copied */
diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h
index 9c9e62740f..66a8180e21 100644
--- a/arm_compute/core/utils/logging/LogMsgDecorators.h
+++ b/arm_compute/core/utils/logging/LogMsgDecorators.h
@@ -63,8 +63,7 @@ public:
*
     * @param str String to append
*/
- StringDecorator(const std::string &str)
- : _str(str)
+ StringDecorator(const std::string &str) : _str(str)
{
_str = angle_wrap_value(str);
}
@@ -103,7 +102,7 @@ private:
auto time = std::chrono::system_clock::to_time_t(now);
// TODO: use put_time for gcc > 4.9
- char buf[100] = { 0 };
+ char buf[100] = {0};
std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&time));
return buf;
}
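The decorator above stamps each message with the current local time via strftime. A minimal sketch of that timestamp routine on its own, reusing the same format string (hypothetical free-function name):

    #include <chrono>
    #include <ctime>
    #include <string>

    std::string current_timestamp_sketch()
    {
        const auto        now = std::chrono::system_clock::now();
        const std::time_t t   = std::chrono::system_clock::to_time_t(now);
        char buf[100] = {0};
        std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&t));
        return buf; // e.g. "01-01-2024 09:30:00"
    }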
diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h
index 4fc9bb7dbf..608db39138 100644
--- a/arm_compute/core/utils/logging/Logger.h
+++ b/arm_compute/core/utils/logging/Logger.h
@@ -88,7 +88,7 @@ public:
* @param[in] args Message arguments
*/
template <typename... Ts>
- void log(LogLevel log_level, const std::string &fmt, Ts &&... args);
+ void log(LogLevel log_level, const std::string &fmt, Ts &&...args);
/** Sets log level of the logger
*
* @warning Not thread-safe
@@ -159,11 +159,11 @@ private:
};
template <typename... Ts>
-inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&... args)
+inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&...args)
{
// Return if message shouldn't be logged
// i.e. if log level does not match the logger's
- if(!is_loggable(log_level))
+ if (!is_loggable(log_level))
{
return;
}
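Logger::log() above drops a message early when is_loggable() rejects its level. A tiny sketch of that kind of severity filter, assuming a hypothetical ordering where more severe levels compare greater and OFF disables everything (the library's actual LogLevel values may differ):

    enum class Severity { VERBOSE, INFO, WARN, ERROR, OFF }; // hypothetical ordering

    bool is_loggable_sketch(Severity msg_level, Severity logger_level)
    {
        // Emit only if the message is at least as severe as the logger's threshold.
        return logger_level != Severity::OFF && msg_level >= logger_level;
    }
    // is_loggable_sketch(Severity::WARN, Severity::INFO) == true
    // is_loggable_sketch(Severity::INFO, Severity::WARN) == false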
diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h
index 7c9931a260..4e52a10935 100644
--- a/arm_compute/core/utils/logging/LoggerRegistry.h
+++ b/arm_compute/core/utils/logging/LoggerRegistry.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/utils/logging/Logger.h"
#include "arm_compute/core/utils/logging/Printers.h"
#include "arm_compute/core/utils/logging/Types.h"
+
#include "support/Mutex.h"
#include <memory>
@@ -54,8 +55,9 @@ public:
* @param[in] log_level Logger's log level. Defaults to INFO
* @param[in] printers Printers to attach to the system loggers. Defaults with a @ref StdPrinter.
*/
- void create_logger(const std::string &name, LogLevel log_level = LogLevel::INFO,
- const std::vector<std::shared_ptr<Printer>> &printers = { std::make_shared<StdPrinter>() });
+ void create_logger(const std::string &name,
+ LogLevel log_level = LogLevel::INFO,
+ const std::vector<std::shared_ptr<Printer>> &printers = {std::make_shared<StdPrinter>()});
/** Remove a logger
*
* @param name Logger's name
@@ -74,16 +76,17 @@ public:
* @param[in] printers (Optional) Printers to attach to the system loggers. Defaults with a @ref StdPrinter.
*/
void create_reserved_loggers(LogLevel log_level = LogLevel::INFO,
- const std::vector<std::shared_ptr<Printer>> &printers = { std::make_shared<StdPrinter>() });
+ const std::vector<std::shared_ptr<Printer>> &printers = {
+ std::make_shared<StdPrinter>()});
private:
/** Default constructor */
LoggerRegistry();
private:
- arm_compute::Mutex _mtx;
+ arm_compute::Mutex _mtx;
std::unordered_map<std::string, std::shared_ptr<Logger>> _loggers;
- static std::set<std::string> _reserved_loggers;
+ static std::set<std::string> _reserved_loggers;
};
} // namespace logging
} // namespace arm_compute
diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h
index 0ab17c4464..4d5aa5fe2c 100644
--- a/arm_compute/core/utils/logging/Macros.h
+++ b/arm_compute/core/utils/logging/Macros.h
@@ -48,48 +48,48 @@ inline std::string signature_name(const std::string &pretty_func)
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
__logger->log(log_level, msg); \
} \
- } while(false)
+ } while (false)
#define ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME(logger_name, log_level, msg) \
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
std::ostringstream s; \
s << ARM_COMPUTE_SIGNATURE_NAME << " : " << msg; \
__logger->log(log_level, s.str()); \
} \
- } while(false)
+ } while (false)
#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...) \
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
size_t size = ::snprintf(nullptr, 0, fmt, __VA_ARGS__) + 1; \
auto char_str = std::make_unique<char[]>(size); \
::snprintf(char_str.get(), size, fmt, __VA_ARGS__); \
__logger->log(log_level, std::string(char_str.get(), char_str.get() + size - 1)); \
} \
- } while(false)
+ } while (false)
#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream) \
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
std::ostringstream s; \
s << stream; \
__logger->log(log_level, s.str()); \
} \
- } while(false)
+ } while (false)
#else /* ARM_COMPUTE_LOGGING_ENABLED */
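All four macros above wrap their bodies in do { ... } while (false) and check the looked-up logger against nullptr before use. The do/while wrapper is what lets a multi-statement macro behave as a single statement after an unbraced if. A small illustration with a hypothetical macro (not the library's):

    #include <cstdio>

    // Hypothetical macro; shows the wrapper pattern only.
    #define LOG_IF_SKETCH(enabled, msg)   \
        do                                \
        {                                 \
            if (enabled)                  \
            {                             \
                std::printf("%s\n", msg); \
            }                             \
        } while (false)

    void example(bool verbose)
    {
        if (verbose)
            LOG_IF_SKETCH(true, "verbose mode"); // expands to one statement, so the else below still binds correctly
        else
            std::printf("quiet mode\n");
    }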
diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h
index f0ddae6c84..64c567b984 100644
--- a/arm_compute/core/utils/logging/Types.h
+++ b/arm_compute/core/utils/logging/Types.h
@@ -44,8 +44,7 @@ enum class LogLevel
struct LogMsg
{
/** Default constructor */
- LogMsg()
- : raw_(), log_level_(LogLevel::OFF)
+ LogMsg() : raw_(), log_level_(LogLevel::OFF)
{
}
/** Construct a log message
@@ -53,8 +52,7 @@ struct LogMsg
* @param[in] msg Message to log.
* @param[in] log_level Logging level. Default: OFF
*/
- LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF)
- : raw_(msg), log_level_(log_level)
+ LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF) : raw_(msg), log_level_(log_level)
{
}
diff --git a/arm_compute/core/utils/math/Math.h b/arm_compute/core/utils/math/Math.h
index c1dce7ff08..e70337ba0f 100644
--- a/arm_compute/core/utils/math/Math.h
+++ b/arm_compute/core/utils/math/Math.h
@@ -67,5 +67,5 @@ inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor)
return (value / divisor) * divisor;
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_UTILS_MATH_H */
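floor_to_multiple() above relies on truncating integer division: dividing and re-multiplying rounds the value down to the nearest multiple of the divisor. A sketch of the same arithmetic plus the matching round-up variant (hypothetical names, unsigned-only for brevity):

    unsigned int floor_to_multiple_sketch(unsigned int value, unsigned int divisor)
    {
        return (value / divisor) * divisor; // truncating division rounds down
    }

    unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int divisor)
    {
        return ((value + divisor - 1) / divisor) * divisor; // round up instead
    }
    // floor_to_multiple_sketch(37, 8) == 32
    // ceil_to_multiple_sketch(37, 8)  == 40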
diff --git a/arm_compute/core/utils/math/SafeOps.h b/arm_compute/core/utils/math/SafeOps.h
index dc928a0e5d..ef8bcf7e14 100644
--- a/arm_compute/core/utils/math/SafeOps.h
+++ b/arm_compute/core/utils/math/SafeOps.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_UTILS_MATH_SAFE_OPS
#include "arm_compute/core/Error.h"
+
#include "support/AclRequires.h"
#include <limits>
@@ -51,11 +52,11 @@ T safe_integer_add(T val_a, T val_b)
{
T result = 0;
- if((val_b > 0) && (val_a > std::numeric_limits<T>::max() - val_b))
+ if ((val_b > 0) && (val_a > std::numeric_limits<T>::max() - val_b))
{
result = std::numeric_limits<T>::max();
}
- else if((val_b < 0) && (val_a < std::numeric_limits<T>::min() - val_b))
+ else if ((val_b < 0) && (val_a < std::numeric_limits<T>::min() - val_b))
{
result = std::numeric_limits<T>::min();
}
@@ -83,11 +84,11 @@ T safe_integer_sub(T val_a, T val_b)
{
T result = 0;
- if((val_b < 0) && (val_a > std::numeric_limits<T>::max() + val_b))
+ if ((val_b < 0) && (val_a > std::numeric_limits<T>::max() + val_b))
{
result = std::numeric_limits<T>::max();
}
- else if((val_b > 0) && (val_a < std::numeric_limits<T>::min() + val_b))
+ else if ((val_b > 0) && (val_a < std::numeric_limits<T>::min() + val_b))
{
result = std::numeric_limits<T>::min();
}
@@ -115,13 +116,13 @@ T safe_integer_mul(T val_a, T val_b)
{
T result = 0;
- if(val_a > 0)
+ if (val_a > 0)
{
- if((val_b > 0) && (val_a > (std::numeric_limits<T>::max() / val_b)))
+ if ((val_b > 0) && (val_a > (std::numeric_limits<T>::max() / val_b)))
{
result = std::numeric_limits<T>::max();
}
- else if(val_b < (std::numeric_limits<T>::min() / val_a))
+ else if (val_b < (std::numeric_limits<T>::min() / val_a))
{
result = std::numeric_limits<T>::min();
}
@@ -132,11 +133,11 @@ T safe_integer_mul(T val_a, T val_b)
}
else
{
- if((val_b > 0) && (val_a < (std::numeric_limits<T>::min() / val_b)))
+ if ((val_b > 0) && (val_a < (std::numeric_limits<T>::min() / val_b)))
{
result = std::numeric_limits<T>::max();
}
- else if((val_a != 0) && (val_b < (std::numeric_limits<T>::max() / val_a)))
+ else if ((val_a != 0) && (val_b < (std::numeric_limits<T>::max() / val_a)))
{
result = std::numeric_limits<T>::min();
}
@@ -165,7 +166,7 @@ T safe_integer_div(T val_a, T val_b)
{
T result = 0;
- if((val_b == 0) || ((val_a == std::numeric_limits<T>::min()) && (val_b == -1)))
+ if ((val_b == 0) || ((val_a == std::numeric_limits<T>::min()) && (val_b == -1)))
{
result = std::numeric_limits<T>::min();
}
@@ -176,7 +177,7 @@ T safe_integer_div(T val_a, T val_b)
return result;
}
-} // namespace cast
+} // namespace math
} // namespace utils
} // namespace arm_compute
#endif /* ARM_COMPUTE_UTILS_MATH_SAFE_OPS */
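The safe_integer_* helpers above clamp to the numeric limits instead of letting signed arithmetic overflow (which is undefined behaviour). The addition case reformatted in the first hunk boils down to two pre-checks against the limits; a standalone sketch:

    #include <cstdint>
    #include <limits>

    template <typename T>
    T saturating_add_sketch(T a, T b)
    {
        if (b > 0 && a > std::numeric_limits<T>::max() - b)
        {
            return std::numeric_limits<T>::max(); // would overflow upwards
        }
        if (b < 0 && a < std::numeric_limits<T>::min() - b)
        {
            return std::numeric_limits<T>::min(); // would overflow downwards
        }
        return a + b; // safe to add
    }
    // saturating_add_sketch<int8_t>(120, 20)   == 127
    // saturating_add_sketch<int8_t>(-120, -20) == -128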
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
index ced0d24b56..1d1b4ea8d7 100644
--- a/arm_compute/core/utils/misc/InfoHelpers.h
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -53,10 +53,12 @@ inline bool is_relu(ActivationLayerInfo activation_info)
*/
inline bool is_relu6(ActivationLayerInfo activation_info)
{
- const bool is_lu_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- && activation_info.a() == 6.f && activation_info.b() == 0.f;
- const bool is_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && activation_info.a() == 6.f;
+ const bool is_lu_bounded_relu =
+ activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU &&
+ activation_info.a() == 6.f && activation_info.b() == 0.f;
+ const bool is_bounded_relu =
+ activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ activation_info.a() == 6.f;
return activation_info.enabled() && (is_lu_bounded_relu || is_bounded_relu);
}
@@ -68,34 +70,37 @@ inline bool is_relu6(ActivationLayerInfo activation_info)
*
*/
template <typename T>
-inline void build_lstm_params_tensor_info(const LSTMParams<T> &lstm_params,
- LSTMParams<ITensorInfo> *lstm_params_info)
+inline void build_lstm_params_tensor_info(const LSTMParams<T> &lstm_params, LSTMParams<ITensorInfo> *lstm_params_info)
{
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+ lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(),
+ lstm_params.cell_to_output_weights()->info());
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.projection_weights());
- lstm_params_info->set_projection_params(lstm_params.projection_weights()->info(),
- lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
+ lstm_params_info->set_projection_params(
+ lstm_params.projection_weights()->info(),
+ lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
}
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(),
+ lstm_params.input_gate_bias());
- ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
- lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
- cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
+ ITensorInfo *cell_to_input_weights_info =
+ (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
+ lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(),
+ lstm_params.recurrent_to_input_weights()->info(), cell_to_input_weights_info,
+ lstm_params.input_gate_bias()->info());
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
- lstm_params.output_layer_norm_weights(),
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.output_layer_norm_weights(),
lstm_params.cell_layer_norm_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
}
@@ -103,15 +108,14 @@ inline void build_lstm_params_tensor_info(const LSTMParams<T> &lstm_params,
ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
ITensorInfo *cell_info = lstm_params.cell_layer_norm_weights()->info();
ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
- ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
+ ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
lstm_params_info->set_layer_normalization_params(input_info, forget_info, cell_info, output_info);
}
- lstm_params_info->set_matmul_scale_params(lstm_params.input_intermediate_scale(),
- lstm_params.forget_intermediate_scale(),
- lstm_params.cell_intermediate_scale(),
- lstm_params.output_intermediate_scale());
+ lstm_params_info->set_matmul_scale_params(
+ lstm_params.input_intermediate_scale(), lstm_params.forget_intermediate_scale(),
+ lstm_params.cell_intermediate_scale(), lstm_params.output_intermediate_scale());
lstm_params_info->set_hidden_state_params(lstm_params.hidden_state_zero(), lstm_params.hidden_state_scale());
}
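is_relu6() above treats an activation as ReLU6 when it is a (LU_)BOUNDED_RELU with upper bound a == 6 (and lower bound b == 0 in the LU case). Numerically that corresponds to clamping to [0, 6]; a one-line sketch:

    #include <algorithm>

    // ReLU6 clamps activations to the [0, 6] range.
    float relu6_sketch(float x)
    {
        return std::min(std::max(x, 0.0f), 6.0f);
    }
    // relu6_sketch(-1.0f) == 0.0f, relu6_sketch(3.5f) == 3.5f, relu6_sketch(9.0f) == 6.0f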
diff --git a/arm_compute/core/utils/misc/Macros.h b/arm_compute/core/utils/misc/Macros.h
index de66b6a52f..fa861fa442 100644
--- a/arm_compute/core/utils/misc/Macros.h
+++ b/arm_compute/core/utils/misc/Macros.h
@@ -26,15 +26,16 @@
#if defined(__cplusplus) && (__cplusplus >= 201402L)
-#define ARM_COMPUTE_DEPRECATED [[deprecated]]
-#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]]
+#define ARM_COMPUTE_DEPRECATED [[deprecated]]
+#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]]
#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) [[deprecated("Deprecated in : " #rel " - Use : " #replace)]]
#elif defined(__GNUC__) || defined(__clang__)
-#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated))
+#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated))
#define ARM_COMPUTE_DEPRECATED_REL(rel) __attribute__((deprecated("Deprecated in : " #rel)))
-#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace)))
+#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) \
+ __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace)))
#else // defined(__cplusplus) && (__cplusplus >= 201402L)
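The macros above pick between the standard [[deprecated("...")]] attribute (C++14 and later) and the GNU/Clang __attribute__((deprecated("..."))) spelling. A sketch of how a declaration tagged with either form could look, using hypothetical names and release strings:

    #if defined(__cplusplus) && (__cplusplus >= 201402L)
    [[deprecated("Deprecated in : v1 - Use : new_api")]] void old_api(); // standard attribute
    #else
    __attribute__((deprecated("Deprecated in : v1 - Use : new_api"))) void old_api(); // GNU/Clang spelling
    #endif

    void new_api(); // calling old_api() now triggers a compiler warning pointing here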
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 4c2037ab8d..31362f1ac4 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -28,11 +28,10 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
-
#include <cmath>
namespace arm_compute
@@ -57,12 +56,12 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin
convert_negative_axis(axis_local, input_dims);
TensorShape out_shape = input->tensor_shape();
// Configure reshape layer if we want to drop the dimensions
- if(!keep_dims)
+ if (!keep_dims)
{
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(int i = 0; i < reduction_ops; ++i)
+ for (int i = 0; i < reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i, false);
}
@@ -70,7 +69,7 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin
}
else
{
- for(int i = 0; i < reduction_ops; ++i)
+ for (int i = 0; i < reduction_ops; ++i)
{
out_shape.set(axis_local[i], 1);
}
@@ -86,7 +85,10 @@ inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordin
*
* @return the calculated shape
*/
-inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout)
+inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input,
+ size_t conv_w,
+ size_t conv_h,
+ const DataLayout &data_layout)
{
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -128,10 +130,12 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t
const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_ERROR_ON(stride <= 0);
- ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0), "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0), "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0),
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0),
+ "The height of the input tensor must be a multiple of stride");
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
output_shape.set(idx_width, output_shape[idx_width] / stride);
output_shape.set(idx_height, output_shape[idx_height] / stride);
@@ -148,7 +152,8 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t
*
* @return the calculated shape of the reshaped weights
*/
-inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
+inline TensorShape
+compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
{
// Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
ARM_COMPUTE_ERROR_ON(num_groups == 0);
@@ -156,14 +161,14 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0);
// Calculate output shape
- TensorShape weights_reshaped{ weights.tensor_shape() };
+ TensorShape weights_reshaped{weights.tensor_shape()};
weights_reshaped.set(3, weights_reshaped[3] / num_groups);
weights_reshaped.collapse(3);
const size_t tmp_dim = weights_reshaped[0];
weights_reshaped.set(0, weights_reshaped[1]);
weights_reshaped.set(1, tmp_dim + (has_bias ? 1 : 0));
- if(weights.num_dimensions() < 5)
+ if (weights.num_dimensions() < 5)
{
weights_reshaped.set(2, num_groups);
}
@@ -179,7 +184,9 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
*
* @return the calculated shape
*/
-inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false)
+inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d = false)
{
ARM_COMPUTE_ERROR_ON(lhs_info.m0 == 0);
ARM_COMPUTE_ERROR_ON(lhs_info.k0 == 0);
@@ -200,11 +207,11 @@ inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLH
const unsigned int output_width = block_size * num_horiz_blocks * lhs_info.v0;
const unsigned int output_height = std::ceil(num_vert_blocks / static_cast<float>(lhs_info.v0));
- TensorShape lhs_shape{ a.tensor_shape() };
+ TensorShape lhs_shape{a.tensor_shape()};
lhs_shape.set(0, output_width);
lhs_shape.set(1, output_height);
- if((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2))
+ if ((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2))
{
// When the data format is NHWC and the shapes are Nx1x1
// the tensor shape num_dimensions is automatically set to 1 instead of 3.
@@ -244,7 +251,7 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH
const unsigned int output_width = block_size * num_vert_blocks * rhs_info.h0;
const unsigned int output_height = std::ceil(num_horiz_blocks / static_cast<float>(rhs_info.h0));
- TensorShape rhs_shape{ a.tensor_shape() };
+ TensorShape rhs_shape{a.tensor_shape()};
rhs_shape.set(0, output_width);
rhs_shape.set(1, output_height);
@@ -259,14 +266,15 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH
*
* @return the calculated shape
*/
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
+inline TensorShape
+compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
{
// The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
const int interleave_width = 4 * mult_interleave4x4_height;
- TensorShape shape_interleaved_a{ a.tensor_shape() };
+ TensorShape shape_interleaved_a{a.tensor_shape()};
shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
- if(reinterpret_input_as_3d)
+ if (reinterpret_input_as_3d)
{
const int M = a.dimension(1) * a.dimension(2);
const int height = std::ceil(M / static_cast<float>(interleave_width));
@@ -276,7 +284,7 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
// the tensor shape num_dimensions is automatically set to 1 instead of 3.
// To avoid failures by removing a dimension that doesn't exist
// check if the number of dimensions is greater than 2.
- if(shape_interleaved_a.num_dimensions() > 2)
+ if (shape_interleaved_a.num_dimensions() > 2)
{
shape_interleaved_a.remove_dimension(2);
}
@@ -298,7 +306,7 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
{
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+ TensorShape shape_transposed1xW_b{b.tensor_shape()};
shape_transposed1xW_b.set(0, b.dimension(1) * 16);
shape_transposed1xW_b.set(1, std::ceil(b.dimension(0) / 16.f));
@@ -318,7 +326,7 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf
// The transpose1xW output matrix will have the following shape:
// [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
- TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+ TensorShape shape_transposed1xW_b{b.tensor_shape()};
const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
@@ -334,8 +342,8 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf
*/
inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
{
- TensorShape shape_vector_sum_col{ b.tensor_shape() };
- if(shape_vector_sum_col.num_dimensions() > 1)
+ TensorShape shape_vector_sum_col{b.tensor_shape()};
+ if (shape_vector_sum_col.num_dimensions() > 1)
{
shape_vector_sum_col.remove_dimension(1);
}
@@ -351,9 +359,9 @@ inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
*/
inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
{
- TensorShape shape_vector_sum_row{ a.tensor_shape() };
+ TensorShape shape_vector_sum_row{a.tensor_shape()};
shape_vector_sum_row.set(Window::DimX, a.dimension(1));
- if(shape_vector_sum_row.num_dimensions() > 1)
+ if (shape_vector_sum_row.num_dimensions() > 1)
{
shape_vector_sum_row.remove_dimension(1);
}
@@ -370,7 +378,10 @@ inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
*
* @return the calculated shape
*/
-inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &convolved_dims, bool batch_size_on_z, unsigned int num_groups = 1)
+inline TensorShape compute_col2im_shape(const ITensorInfo &input,
+ const Size2D &convolved_dims,
+ bool batch_size_on_z,
+ unsigned int num_groups = 1)
{
ARM_COMPUTE_ERROR_ON(num_groups == 0);
ARM_COMPUTE_ERROR_ON(input.tensor_shape()[1] != (convolved_dims.area()));
@@ -381,10 +392,10 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- TensorShape col2im_shape{ input.tensor_shape() };
+ TensorShape col2im_shape{input.tensor_shape()};
// If batches start on 3rd dimension shift dimensions right by 1 to retain upper tensor shape,
    // as the first three will be overridden by H,W,C data
- if(batch_size_on_z && num_groups == 1)
+ if (batch_size_on_z && num_groups == 1)
{
col2im_shape.shift_right(1);
}
@@ -403,7 +414,7 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &
*/
inline TensorShape compute_transposed_shape(const ITensorInfo &input)
{
- TensorShape shape_transposed{ input.tensor_shape() };
+ TensorShape shape_transposed{input.tensor_shape()};
shape_transposed.set(0, input.dimension(1), false);
shape_transposed.set(1, input.dimension(0), false);
@@ -419,10 +430,11 @@ inline TensorShape compute_transposed_shape(const ITensorInfo &input)
*
* @return the calculated shape
*/
-inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
+inline TensorShape
+compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
{
- const TensorShape input_shape{ input.tensor_shape() };
- const TensorShape weights_shape{ weights.tensor_shape() };
+ const TensorShape input_shape{input.tensor_shape()};
+ const TensorShape weights_shape{weights.tensor_shape()};
const DataLayout data_layout = input.data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -430,16 +442,16 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const DataLayout weights_data_layout = weights.data_layout();
- const int weights_width_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH);
- const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT);
+ const int weights_width_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH);
+ const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT);
unsigned int output_width = 0;
unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_shape[width_idx], input_shape[height_idx],
- weights_shape[weights_width_idx], weights_shape[weights_height_idx],
- info.pad_stride_info, info.dilation);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_shape[width_idx], input_shape[height_idx], weights_shape[weights_width_idx],
+ weights_shape[weights_height_idx], info.pad_stride_info, info.dilation);
- TensorShape output_shape{ input_shape };
+ TensorShape output_shape{input_shape};
output_shape.set(width_idx, output_width);
output_shape.set(height_idx, output_height);
output_shape.set(channel_idx, input_shape[channel_idx] * info.depth_multiplier);
@@ -459,8 +471,13 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
*
* @return the calculated shape
*/
-inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy,
- std::pair<unsigned int, unsigned int> &out_dims, uint32_t &padx, uint32_t &pady)
+inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ unsigned int sx,
+ unsigned int sy,
+ std::pair<unsigned int, unsigned int> &out_dims,
+ uint32_t &padx,
+ uint32_t &pady)
{
const DataLayout data_layout = input.data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -491,10 +508,12 @@ inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &inpu
*
* @return the calculated shape
*/
-inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, const ITensorInfo &input, const ITensorInfo &weights)
+inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+ const ITensorInfo &input,
+ const ITensorInfo &weights)
{
- const TensorShape input_shape{ input.tensor_shape() };
- const TensorShape weights_shape{ weights.tensor_shape() };
+ const TensorShape input_shape{input.tensor_shape()};
+ const TensorShape weights_shape{weights.tensor_shape()};
const DataLayout data_layout = input.data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -502,7 +521,7 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- TensorShape out_shape{ input_shape };
+ TensorShape out_shape{input_shape};
out_shape.set(width_idx, out_dims.first);
out_shape.set(height_idx, out_dims.second);
out_shape.set(channel_idx, weights_shape[batch_idx]);
@@ -522,8 +541,14 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
*
* @return the calculated shape
*/
-inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z,
- unsigned int num_groups = 1, unsigned int input_pad_right = 0)
+inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ bool batch_size_on_z,
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0)
{
// The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ] if batch_size_on_z == true
// or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ] if batch_size_on_z == false
@@ -532,17 +557,19 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
ARM_COMPUTE_ERROR_ON(num_groups > 1 && input->data_layout() != DataLayout::NCHW);
ARM_COMPUTE_ERROR_ON(num_groups > 1 && batch_size_on_z);
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
const DataLayout data_layout = input->data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
- output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() + (has_bias ? 1 : 0))); // NOLINT
+ std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(
+ output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() +
+ (has_bias ? 1 : 0))); // NOLINT
output_shape.set(1, (out_dims.first * out_dims.second));
- if(batch_size_on_z && output_shape.num_dimensions() >= 3)
+ if (batch_size_on_z && output_shape.num_dimensions() >= 3)
{
output_shape.remove_dimension(2);
}
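The im2col shape computed in the hunk above packs each receptive field into one column: the first output dimension is channels x kernel area (plus one row when a bias is folded in), the second is the number of output spatial positions. A small sketch of that arithmetic, ignoring grouping and the input_pad_right extension:

    #include <cstddef>
    #include <utility>

    // {dim0, dim1} of the im2col output for one batch (sketch only).
    std::pair<size_t, size_t> im2col_dims_sketch(
        size_t in_channels, size_t kernel_w, size_t kernel_h, size_t out_w, size_t out_h, bool has_bias)
    {
        const size_t dim0 = in_channels * kernel_w * kernel_h + (has_bias ? 1 : 0);
        const size_t dim1 = out_w * out_h;
        return {dim0, dim1};
    }
    // 3 input channels, 3x3 kernel, 16x16 output, bias folded in:
    // im2col_dims_sketch(3, 3, 3, 16, 16, true) == {28, 256}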
@@ -564,7 +591,7 @@ inline TensorShape compute_flatten_shape(const ITensorInfo *input)
{
// The output shape will be the flatten version of the input (i.e. [ width * height * channels, num_batches, ... ] ). Used for FlattenLayer and FullyConnectedLayer.
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
output_shape.collapse(3);
@@ -586,7 +613,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
// - [x,y,z,w] and axis 3 will return [x*y*z, w]
TensorShape shape2D = input->tensor_shape();
- if(axis < input->num_dimensions())
+ if (axis < input->num_dimensions())
{
// Collapse from axis onward (this changes the shape)
shape2D.collapse_from(axis);
@@ -600,7 +627,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
shape2D.collapse(shape2D.num_dimensions());
}
- if(axis == 0)
+ if (axis == 0)
{
// If axis is zero the first dim should be one. Since
// collapse is an inclusive operation we need to shift
@@ -619,15 +646,17 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
*/
inline TensorShape compute_winograd_filter_transform_shape(const ITensorInfo &input, const WinogradInfo &winograd_info)
{
- TensorShape tensor_shape{ input.tensor_shape() };
+ TensorShape tensor_shape{input.tensor_shape()};
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D input_tile_size = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
+ const Size2D input_tile_size =
+ Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
tensor_shape.remove_dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH));
tensor_shape.set(Window::DimX, input.dimension(3));
- tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL)));
+ tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(),
+ DataLayoutDimension::CHANNEL)));
tensor_shape.set(Window::DimZ, input_tile_size.area());
return tensor_shape;
@@ -645,23 +674,22 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D input_tile_size = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
+ const Size2D input_tile_size =
+ Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
const size_t idx_w = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
const size_t idx_c = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
// Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]),
- kernel_size,
- output_tile_size,
- conv_info);
+ const Size2D num_tiles = compute_winograd_convolution_tiles(
+ Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]), kernel_size, output_tile_size, conv_info);
const unsigned int width = input.tensor_shape()[idx_c];
const unsigned int height = num_tiles.area();
const unsigned int depth = input_tile_size.area();
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
output_shape.set(0, width);
output_shape.set(1, height);
output_shape.set(2, depth);
@@ -684,12 +712,12 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in
const DataLayout data_layout = winograd_info.output_data_layout;
// Compute output shape
- unsigned int output_width = 0;
- unsigned int output_height = 0;
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
std::tie(output_width, output_height) = scaled_dimensions(input_dimensions.width, input_dimensions.height,
kernel_size.width, kernel_size.height, conv_info);
- TensorShape tensor_shape{ input.tensor_shape() };
+ TensorShape tensor_shape{input.tensor_shape()};
// Output dimension
const unsigned int out_w = output_width;
@@ -712,7 +740,10 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in
*
* @return the calculated shape
*/
-inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape, DataLayout input_data_layout, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
+inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape,
+ DataLayout input_data_layout,
+ const TensorShape &weights_shape,
+ const PadStrideInfo &conv_info)
{
const size_t idx_width = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
const size_t idx_height = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
@@ -725,9 +756,10 @@ inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape
const unsigned int weights_out_channel = weights_shape[3];
unsigned int output_width = 0;
unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
- TensorShape output_shape{ input_shape };
+ TensorShape output_shape{input_shape};
output_shape.set(idx_width, output_width);
output_shape.set(idx_height, output_height);
output_shape.set(idx_channel, weights_out_channel);
@@ -743,7 +775,8 @@ inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape
*
* @return the calculated shape
*/
-inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info)
+inline TensorShape
+compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info)
{
return compute_deep_convolution_shape(input.tensor_shape(), input.data_layout(), weights.tensor_shape(), conv_info);
}
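compute_deep_convolution_shape() above derives the spatial output size from scaled_dimensions(). With FLOOR rounding and no dilation that reduces to the usual convolution formula; a sketch under those assumptions (the library also supports CEIL rounding and dilation):

    unsigned int conv_output_dim_sketch(
        unsigned int in, unsigned int kernel, unsigned int pad_begin, unsigned int pad_end, unsigned int stride)
    {
        return (in + pad_begin + pad_end - kernel) / stride + 1; // FLOOR rounding
    }
    // 224 input, 3x3 kernel, padding 1 on each side, stride 2:
    // conv_output_dim_sketch(224, 3, 1, 1, 2) == 112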
@@ -758,7 +791,10 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
*
* @return the calculated shape
*/
-inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape, DataLayout input_data_layout, const TensorShape &weights_shape, const PadStrideInfo &conv_info,
+inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape,
+ DataLayout input_data_layout,
+ const TensorShape &weights_shape,
+ const PadStrideInfo &conv_info,
const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_ERROR_ON_MSG(input_data_layout != DataLayout::NHWC, "The data layout can only be NHWC");
@@ -768,7 +804,8 @@ inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape,
const unsigned int kw = weights_shape[1];
const unsigned int kh = weights_shape[2];
- TensorShape output_conv2d_shape = compute_deep_convolution_shape(input_shape, input_data_layout, weights_shape, conv_info);
+ TensorShape output_conv2d_shape =
+ compute_deep_convolution_shape(input_shape, input_data_layout, weights_shape, conv_info);
const unsigned int output_w = m0 * kw * kh;
const unsigned int output_h = DIV_CEIL(output_conv2d_shape[1] * output_conv2d_shape[2], m0);
@@ -785,7 +822,7 @@ inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape,
*/
inline TensorShape compute_min_max_shape(const ITensorInfo *input)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
output_shape.set(Window::DimX, 2);
output_shape.remove_dimension(1);
output_shape.remove_dimension(1);
@@ -805,7 +842,7 @@ inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo
int pooled_w = 0;
int pooled_h = 0;
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
const bool is_global_pooling = pool_info.is_global_pooling;
const int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
@@ -815,9 +852,8 @@ inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo
const int pool_size_x = is_global_pooling ? output_shape[idx_width] : pool_info.pool_size.width;
const int pool_size_y = is_global_pooling ? output_shape[idx_height] : pool_info.pool_size.height;
- std::tie(pooled_w, pooled_h) = scaled_dimensions_signed(input_width, input_height,
- pool_size_x, pool_size_y,
- pool_info.pad_stride_info);
+ std::tie(pooled_w, pooled_h) =
+ scaled_dimensions_signed(input_width, input_height, pool_size_x, pool_size_y, pool_info.pad_stride_info);
ARM_COMPUTE_ERROR_ON_MSG((pooled_w < 1 || pooled_h < 1), "Calculated output dimension size is invalid");
@@ -850,8 +886,10 @@ inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerIn
const int pad_bottom = pad_stride_info.pad_bottom();
TensorShape output_shape = input_shape;
- const unsigned int out_width = (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width;
- const unsigned int out_height = (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height;
+ const unsigned int out_width =
+ (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width;
+ const unsigned int out_height =
+ (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height;
output_shape.set(idx_width, out_width);
output_shape.set(idx_height, out_height);
@@ -866,9 +904,10 @@ inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerIn
*
* @return the calculated shape
*/
-inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info)
+inline TensorShape
+compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info)
{
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
const unsigned int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
@@ -889,7 +928,7 @@ inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITens
*/
inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned int batch_size)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
output_shape.set(1, batch_size);
return output_shape;
@@ -904,15 +943,21 @@ inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned in
*
* @return the calculated shape
*/
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+inline TensorShape compute_mm_shape(const ITensorInfo &input0,
+ const ITensorInfo &input1,
+ bool is_interleaved_transposed,
+ const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
+ ARM_COMPUTE_ERROR_ON_MSG(
+ is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(),
+ "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
const bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 0;
const int depth_output_gemm3d = reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : 1;
- const int m = reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
+ const int m =
+ reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
// If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
// dimension of the output tensor
@@ -921,7 +966,7 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
const int dim2 = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
const int dim3 = reinterpret_input_as_3d ? 1 : input0.tensor_shape()[3];
- TensorShape output_shape{ input0.tensor_shape() };
+ TensorShape output_shape{input0.tensor_shape()};
output_shape.set(0, dim0);
output_shape.set(1, dim1);
@@ -940,7 +985,8 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
*
* @return the calculated shape
*/
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info)
+inline TensorShape
+compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(input1);
ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
@@ -949,9 +995,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d() != 0;
const int depth_output_gemm3d = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d() : 1;
- TensorShape output_shape{ input0.tensor_shape() };
+ TensorShape output_shape{input0.tensor_shape()};
- if(!reinterpret_input_as_3d && !reinterpret_output_as_3d)
+ if (!reinterpret_input_as_3d && !reinterpret_output_as_3d)
{
output_shape.set(0, gemm_info.n());
output_shape.set(1, gemm_info.m());
@@ -978,7 +1024,8 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
*
* @return the calculated shape
*/
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info)
+inline TensorShape
+compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(input1);
ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
@@ -987,9 +1034,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
const unsigned int depth_output_gemm3d = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d : 1;
- TensorShape output_shape{ input0.tensor_shape() };
+ TensorShape output_shape{input0.tensor_shape()};
- if(!reinterpret_input_as_3d && !reinterpret_output_as_3d)
+ if (!reinterpret_input_as_3d && !reinterpret_output_as_3d)
{
output_shape.set(0, gemm_info.n);
output_shape.set(1, gemm_info.m);
@@ -1016,16 +1063,17 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
*
* @return the calculated shape
*/
-inline TensorShape compute_matmul_shape(const TensorShape &input0, const TensorShape &input1, const MatMulKernelInfo &matmul_info)
+inline TensorShape
+compute_matmul_shape(const TensorShape &input0, const TensorShape &input1, const MatMulKernelInfo &matmul_info)
{
- TensorShape output_shape{ input0 };
+ TensorShape output_shape{input0};
- if(matmul_info.adj_lhs)
+ if (matmul_info.adj_lhs)
{
output_shape.set(1, input0[0]); // The vertical (M) dimension
}
- if(matmul_info.adj_rhs)
+ if (matmul_info.adj_rhs)
{
output_shape.set(0, input1[1]); // The horizontal (N) dimension
}
@@ -1044,14 +1092,15 @@ inline TensorShape compute_matmul_shape(const TensorShape &input0, const TensorS
*
* @return the calculated shape
*/
-inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false)
+inline TensorShape
+compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false)
{
ARM_COMPUTE_ERROR_ON(input.data_layout() != DataLayout::NHWC && gemm_3d_depth > 1);
TensorShape output_shape = input.tensor_shape();
- if(gemm_3d_depth > 1)
+ if (gemm_3d_depth > 1)
{
- if(batch_size_on_z)
+ if (batch_size_on_z)
{
output_shape.shift_right(1);
}
@@ -1076,11 +1125,16 @@ inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned
* @return the calculated shape
*/
inline TensorShape compute_strided_slice_shape(const ITensorInfo &input,
- const Coordinates &starts, const Coordinates &ends, const Coordinates &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const Coordinates &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
using namespace arm_compute::helpers::tensor_transform;
- return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
/** Calculate the slice output shape of a tensor
@@ -1091,13 +1145,13 @@ inline TensorShape compute_strided_slice_shape(const ITensorInfo &input,
*
* @return the calculated shape
*/
-inline TensorShape compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends)
+inline TensorShape
+compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends)
{
using namespace arm_compute::helpers::tensor_transform;
- return compute_strided_slice_output_shape(input_shape,
- starts, ends, BiStrides(),
- 0, construct_slice_end_mask(ends), 0);
+ return compute_strided_slice_output_shape(input_shape, starts, ends, BiStrides(), 0, construct_slice_end_mask(ends),
+ 0);
}
/** Calculate the batch to space output shape of a tensor
@@ -1110,7 +1164,8 @@ inline TensorShape compute_slice_shape(const TensorShape &input_shape, const Coo
*
* @return the calculated shape
*/
-inline TensorShape compute_batch_to_space_shape(DataLayout data_layout, const TensorShape &input, int block_x, int block_y, const CropInfo &crop_info = CropInfo{})
+inline TensorShape compute_batch_to_space_shape(
+ DataLayout data_layout, const TensorShape &input, int block_x, int block_y, const CropInfo &crop_info = CropInfo{})
{
ARM_COMPUTE_ERROR_ON(block_x < 1 || block_y < 1);
@@ -1118,7 +1173,7 @@ inline TensorShape compute_batch_to_space_shape(DataLayout data_layout, const Te
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- TensorShape output_shape{ input };
+ TensorShape output_shape{input};
unsigned int new_width = input[idx_width] * static_cast<unsigned int>(block_x);
unsigned int new_height = input[idx_height] * static_cast<unsigned int>(block_y);
@@ -1152,7 +1207,7 @@ inline TensorShape compute_depth_to_space_shape(const TensorShape &input_shape,
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- TensorShape output_shape{ input_shape };
+ TensorShape output_shape{input_shape};
output_shape.set(idx_width, input_shape[idx_width] * block);
output_shape.set(idx_height, input_shape[idx_height] * block);
output_shape.set(idx_channel, input_shape[idx_channel] / (block * block));
@@ -1173,10 +1228,10 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
TensorShape empty_shape;
empty_shape.set(0, 0);
- TensorShape out_shape{ input->tensor_shape() };
+ TensorShape out_shape{input->tensor_shape()};
// Return empty shape if axis is invalid
- if(axis > input->tensor_shape().num_dimensions())
+ if (axis > input->tensor_shape().num_dimensions())
{
return empty_shape;
}
@@ -1184,7 +1239,7 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
size_t axis_size = out_shape[axis];
// Return empty shape if num_split is not valid
- if(axis_size % num_splits)
+ if (axis_size % num_splits)
{
return empty_shape;
}
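To make the early-exit checks above concrete, a small sketch under the same ShapeCalculator.h assumptions as the slice example; the tensor shape, axis and split count are hypothetical. Splitting an (8, 6) tensor in two along axis 0 is expected to give (4, 6) per part, while an invalid axis or a non-divisible split count yields the empty shape returned above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

using namespace arm_compute;

// Hypothetical example: (8, 6) split into 2 along axis 0 -> (4, 6) per part.
TensorShape example_split_shape()
{
    const TensorInfo info(TensorShape(8U, 6U), 1, DataType::F32);
    return misc::shape_calculator::compute_split_shape(&info, 0U, 2U);
}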
@@ -1203,9 +1258,10 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
*
* @return the calculated shape
*/
-inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, int block_x, int block_y, const Size2D &padding_left, const Size2D &padding_right)
+inline TensorShape compute_space_to_batch_shape(
+ const ITensorInfo *input, int block_x, int block_y, const Size2D &padding_left, const Size2D &padding_right)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -1231,7 +1287,7 @@ inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, int bl
*/
inline TensorShape compute_space_to_depth_shape(const ITensorInfo *input, int32_t block_shape)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -1276,7 +1332,7 @@ inline TensorShape compute_prior_box_shape(const ITensorInfo &input, const Prior
inline TensorShape compute_padded_shape(const TensorShape &input_shape, const PaddingList &padding)
{
TensorShape padded_shape = input_shape;
- for(size_t dim = 0; dim < padding.size(); ++dim)
+ for (size_t dim = 0; dim < padding.size(); ++dim)
{
const auto &padding_pair = padding[dim];
const uint32_t shape_on_index = (padded_shape.num_dimensions() <= dim) ? 1 : input_shape[dim];
@@ -1295,7 +1351,7 @@ inline TensorShape compute_padded_shape(const TensorShape &input_shape, const Pa
inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Multiples &multiples)
{
TensorShape tiled_shape = input_shape;
- for(size_t dim = 0; dim < multiples.size(); ++dim)
+ for (size_t dim = 0; dim < multiples.size(); ++dim)
{
tiled_shape.set(dim, input_shape[dim] * multiples[dim]);
}
@@ -1312,9 +1368,9 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul
*/
inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims = true)
{
- TensorShape output_shape{ input };
+ TensorShape output_shape{input};
- if(!keep_dims)
+ if (!keep_dims)
{
output_shape.remove_dimension(axis);
}
@@ -1407,14 +1463,14 @@ inline TensorShape calculate_concatenate_shape(const std::vector<T *> &input, si
#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
// All dimensions must match except the axis one
- for(unsigned int i = 0; i < MAX_DIMS; ++i)
+ for (unsigned int i = 0; i < MAX_DIMS; ++i)
{
- if(i == axis)
+ if (i == axis)
{
continue;
}
- for(const auto &tensor : input)
+ for (const auto &tensor : input)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
const TensorShape shape = extract_shape(tensor);
@@ -1425,7 +1481,7 @@ inline TensorShape calculate_concatenate_shape(const std::vector<T *> &input, si
// Calculate output shape
size_t new_size = 0;
- for(const auto &tensor : input)
+ for (const auto &tensor : input)
{
const TensorShape shape = extract_shape(tensor);
new_size += shape[axis];
@@ -1448,14 +1504,14 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis,
ARM_COMPUTE_ERROR_ON(axis > a.num_dimensions());
ARM_COMPUTE_ERROR_ON(a.num_dimensions() > 4);
- TensorShape shape_out{ a.tensor_shape() };
+ TensorShape shape_out{a.tensor_shape()};
shape_out.set(axis, num_tensors);
unsigned int i_shift = 0;
- for(unsigned int i = 0; i < a.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < a.num_dimensions(); ++i)
{
- if(i == axis)
+ if (i == axis)
{
i_shift++;
}
@@ -1473,7 +1529,8 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis,
*
* @return the calculated shape
*/
-inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShape &weights, const Conv3dInfo &conv3d_info)
+inline TensorShape
+compute_conv3d_shape(const TensorShape &src, const TensorShape &weights, const Conv3dInfo &conv3d_info)
{
// Weight tensor shape indices (D H W Cin Cout)
constexpr unsigned int weights_depth_dim = 4u;
@@ -1488,7 +1545,7 @@ inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShap
constexpr unsigned int width_dim = 1u;
constexpr unsigned int channel_dim = 0u;
- TensorShape output_shape{ src };
+ TensorShape output_shape{src};
const size_t pad_left = conv3d_info.padding.left;
const size_t pad_right = conv3d_info.padding.right;
const size_t pad_top = conv3d_info.padding.top;
@@ -1506,17 +1563,41 @@ inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShap
int output_height_size = 0;
int output_depth_size = 0;
- switch(conv3d_info.round_type)
+ switch (conv3d_info.round_type)
{
case DimensionRoundingType::FLOOR:
- output_width_size = static_cast<int>(std::floor((static_cast<float>(src[width_dim] + pad_left + pad_right - (dilation_x * (weights[weights_width_dim] - 1) + 1)) / stride_x) + 1));
- output_height_size = static_cast<int>(std::floor((static_cast<float>(src[height_dim] + pad_top + pad_bottom - (dilation_y * (weights[weights_height_dim] - 1) + 1)) / stride_y) + 1));
- output_depth_size = static_cast<int>(std::floor((static_cast<float>(src[depth_dim] + pad_front + pad_back - (dilation_z * (weights[weights_depth_dim] - 1) + 1)) / stride_z) + 1));
+ output_width_size =
+ static_cast<int>(std::floor((static_cast<float>(src[width_dim] + pad_left + pad_right -
+ (dilation_x * (weights[weights_width_dim] - 1) + 1)) /
+ stride_x) +
+ 1));
+ output_height_size =
+ static_cast<int>(std::floor((static_cast<float>(src[height_dim] + pad_top + pad_bottom -
+ (dilation_y * (weights[weights_height_dim] - 1) + 1)) /
+ stride_y) +
+ 1));
+ output_depth_size =
+ static_cast<int>(std::floor((static_cast<float>(src[depth_dim] + pad_front + pad_back -
+ (dilation_z * (weights[weights_depth_dim] - 1) + 1)) /
+ stride_z) +
+ 1));
break;
case DimensionRoundingType::CEIL:
- output_width_size = static_cast<int>(std::ceil((static_cast<float>(src[width_dim] + pad_left + pad_right - (dilation_x * (weights[weights_width_dim] - 1) + 1)) / stride_x) + 1));
- output_height_size = static_cast<int>(std::ceil((static_cast<float>(src[height_dim] + pad_top + pad_bottom - (dilation_y * (weights[weights_height_dim] - 1) + 1)) / stride_y) + 1));
- output_depth_size = static_cast<int>(std::ceil((static_cast<float>(src[depth_dim] + pad_front + pad_back - (dilation_z * (weights[weights_depth_dim] - 1) + 1)) / stride_z) + 1));
+ output_width_size =
+ static_cast<int>(std::ceil((static_cast<float>(src[width_dim] + pad_left + pad_right -
+ (dilation_x * (weights[weights_width_dim] - 1) + 1)) /
+ stride_x) +
+ 1));
+ output_height_size =
+ static_cast<int>(std::ceil((static_cast<float>(src[height_dim] + pad_top + pad_bottom -
+ (dilation_y * (weights[weights_height_dim] - 1) + 1)) /
+ stride_y) +
+ 1));
+ output_depth_size =
+ static_cast<int>(std::ceil((static_cast<float>(src[depth_dim] + pad_front + pad_back -
+ (dilation_z * (weights[weights_depth_dim] - 1) + 1)) /
+ stride_z) +
+ 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
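To make the FLOOR branch above easier to follow, a one-dimensional sketch of the same expression; the helper name and the numbers are hypothetical.

#include <cmath>

// Mirrors the per-dimension computation: floor((in + pads - (dilation * (kernel - 1) + 1)) / stride + 1).
inline int conv_out_dim_floor(int in, int pad_lo, int pad_hi, int kernel, int dilation, int stride)
{
    return static_cast<int>(
        std::floor(static_cast<float>(in + pad_lo + pad_hi - (dilation * (kernel - 1) + 1)) / stride + 1));
}
// e.g. conv_out_dim_floor(32, 1, 1, 3, 1, 2) == floor(31 / 2.0f + 1) == 16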
@@ -1539,7 +1620,7 @@ inline TensorShape compute_conv3d_shape(const TensorShape &src, const TensorShap
*/
inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerInfo pool3d_info)
{
- TensorShape output_shape{ src };
+ TensorShape output_shape{src};
const auto data_layout = DataLayout::NDHWC;
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -1552,10 +1633,12 @@ inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerIn
int output_height = 0;
int output_depth = 0;
- std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src[idx_width], src[idx_height], src[idx_depth], pool_size_width, pool_size_height,
- pool_size_depth, pool3d_info);
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src[idx_width], src[idx_height], src[idx_depth], pool_size_width, pool_size_height,
+ pool_size_depth, pool3d_info);
- ARM_COMPUTE_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+ ARM_COMPUTE_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
output_shape.set(idx_width, static_cast<size_t>(output_width));
output_shape.set(idx_height, static_cast<size_t>(output_height));
@@ -1576,7 +1659,8 @@ inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerIn
*
* @return the calculated shape
*/
-inline TensorShape compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis)
+inline TensorShape
+compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis)
{
const auto input_num_dims = input_shape.num_dimensions();
const auto indices_num_dims = indices_shape.num_dimensions();
@@ -1587,22 +1671,23 @@ inline TensorShape compute_gather_shape(const TensorShape &input_shape, const Te
TensorShape output_shape;
size_t dim_no = 0;
- for(; dim_no < actual_axis; ++dim_no)
+ for (; dim_no < actual_axis; ++dim_no)
{
output_shape.set(dim_no, input_shape[dim_no]);
}
- for(; dim_no < actual_axis + indices_num_dims; ++dim_no)
+ for (; dim_no < actual_axis + indices_num_dims; ++dim_no)
{
output_shape.set(dim_no, indices_shape[dim_no - actual_axis]);
}
- for(; dim_no < input_num_dims + indices_num_dims - 1; ++dim_no)
+ for (; dim_no < input_num_dims + indices_num_dims - 1; ++dim_no)
{
output_shape.set(dim_no, input_shape[dim_no + 1 - indices_num_dims]);
}
- ARM_COMPUTE_ERROR_ON(input_shape.total_size() * indices_shape.total_size() != output_shape.total_size() * input_shape[actual_axis]);
+ ARM_COMPUTE_ERROR_ON(input_shape.total_size() * indices_shape.total_size() !=
+ output_shape.total_size() * input_shape[actual_axis]);
return output_shape;
}
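A worked example of the three loops above, under the same ShapeCalculator.h assumptions; the shapes and the example function name are hypothetical. Gathering along axis 1 of a (4, 3, 2) input with indices of shape (5) replaces that axis with the indices dimensions, giving (4, 5, 2); the element-count assertion holds since 4*3*2*5 == 4*5*2*3.

#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

using namespace arm_compute;

// Hypothetical example: (4, 3, 2) gathered along axis 1 with (5) indices -> (4, 5, 2).
TensorShape example_gather_shape()
{
    return misc::shape_calculator::compute_gather_shape(TensorShape(4U, 3U, 2U), TensorShape(5U), 1U);
}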
diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h
index 933922f63c..944fcb95f9 100644
--- a/arm_compute/core/utils/misc/Traits.h
+++ b/arm_compute/core/utils/misc/Traits.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
#include "arm_compute/core/Types.h"
+
#include <type_traits>
namespace arm_compute
diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h
index e3e20d719f..22f10d74cc 100644
--- a/arm_compute/core/utils/misc/Utility.h
+++ b/arm_compute/core/utils/misc/Utility.h
@@ -44,7 +44,7 @@ struct index_sequence
};
template <std::size_t N, std::size_t... S>
-struct index_sequence_generator : index_sequence_generator < N - 1, N - 1, S... >
+struct index_sequence_generator : index_sequence_generator<N - 1, N - 1, S...>
{
};
@@ -58,17 +58,17 @@ template <std::size_t N>
using index_sequence_t = typename index_sequence_generator<N>::type;
template <typename T, std::size_t N, T val, T... vals>
-struct generate_array : generate_array < T, N - 1, val, val, vals... >
+struct generate_array : generate_array<T, N - 1, val, val, vals...>
{
};
template <typename T, T val, T... vals>
struct generate_array<T, 0, val, vals...>
{
- static constexpr std::array<T, sizeof...(vals)> value{ vals... };
+ static constexpr std::array<T, sizeof...(vals)> value{vals...};
};
-template <typename T, T val, T... vals>
+template <typename T, T val, T... vals>
constexpr std::array<T, sizeof...(vals)> generate_array<T, 0, val, vals...>::value;
/** @endcond */
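The recursive templates above peel one element off N per instantiation until the N == 0 specialisation materialises the array. Below is a standalone sketch of the same compile-time pattern, written from scratch rather than taken from this header (the fill_array name is made up).

#include <array>
#include <cstddef>

template <typename T, std::size_t N, T val, T... vals>
struct fill_array : fill_array<T, N - 1, val, val, vals...>
{
};

template <typename T, T val, T... vals>
struct fill_array<T, 0, val, vals...>
{
    static constexpr std::array<T, sizeof...(vals)> value{vals...};
};

// fill_array<int, 3, 7>::value is {7, 7, 7}: each recursion step appends one copy of val to the pack.
static_assert(fill_array<int, 3, 7>::value.size() == 3, "three elements");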
@@ -79,7 +79,7 @@ template <std::size_t... S,
typename T = std::array<typename std::iterator_traits<Iterator>::value_type, sizeof...(S)>>
T make_array(Iterator first, index_sequence<S...>)
{
- return T{ { first[S]... } };
+ return T{{first[S]...}};
}
} // namespace detail
@@ -87,7 +87,7 @@ template <std::size_t N, typename Iterator>
std::array<typename std::iterator_traits<Iterator>::value_type, N> make_array(Iterator first, Iterator last)
{
ARM_COMPUTE_UNUSED(last);
- return detail::make_array(first, index_sequence_t<N> {});
+ return detail::make_array(first, index_sequence_t<N>{});
}
/** Performs clamping among a lower and upper value.
@@ -119,7 +119,7 @@ inline void for_each(F &&)
* @param[in] args Remaining arguments
*/
template <typename F, typename T, typename... Ts>
-inline void for_each(F &&func, T &&arg, Ts &&... args)
+inline void for_each(F &&func, T &&arg, Ts &&...args)
{
func(std::forward<T>(arg));
for_each(std::forward<F>(func), std::forward<Ts>(args)...);
@@ -143,9 +143,11 @@ inline T &&foldl(F &&, T &&value)
* @param[in] values Remaining arguments
*/
template <typename F, typename T, typename U, typename... Us>
-inline auto foldl(F &&func, T &&initial, U &&value, Us &&... values) -> decltype(func(std::forward<T>(initial), std::forward<U>(value)))
+inline auto foldl(F &&func, T &&initial, U &&value, Us &&...values)
+ -> decltype(func(std::forward<T>(initial), std::forward<U>(value)))
{
- return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)), std::forward<Us>(values)...);
+ return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)),
+ std::forward<Us>(values)...);
}
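A small sketch of how the two variadic helpers above behave; it assumes they are reachable as arm_compute::utility::for_each and arm_compute::utility::foldl, as the namespace comments in this header suggest.

#include "arm_compute/core/utils/misc/Utility.h"

#include <iostream>

int main()
{
    using namespace arm_compute::utility;
    // for_each applies the callable to every argument in turn: prints "1 2 3 ".
    for_each([](int v) { std::cout << v << ' '; }, 1, 2, 3);
    // foldl folds left to right: ((0 + 1) + 2) + 3 == 6.
    std::cout << '\n' << foldl([](int a, int b) { return a + b; }, 0, 1, 2, 3) << '\n';
    return 0;
}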
/** Perform an index sort of a given vector.
@@ -160,11 +162,7 @@ std::vector<size_t> sort_indices(const std::vector<T> &v)
std::vector<size_t> idx(v.size());
std::iota(idx.begin(), idx.end(), 0);
- std::sort(idx.begin(), idx.end(),
- [&v](size_t i1, size_t i2)
- {
- return v[i1] < v[i2];
- });
+ std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) { return v[i1] < v[i2]; });
return idx;
}
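For completeness, a tiny sketch of sort_indices under the same namespace assumption; the input values and the example function name are hypothetical. The returned vector holds positions of the input ordered by ascending value.

#include "arm_compute/core/utils/misc/Utility.h"

#include <vector>

std::vector<size_t> example_sort_indices()
{
    const std::vector<int> v{30, 10, 20};
    // Expected index order: 1 (10), 2 (20), 0 (30).
    return arm_compute::utility::sort_indices(v);
}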
@@ -178,7 +176,7 @@ std::vector<size_t> sort_indices(const std::vector<T> &v)
*/
inline bool endswith(const std::string &str, const std::string &suffix)
{
- if(str.size() < suffix.size())
+ if (str.size() < suffix.size())
{
return false;
}
@@ -205,10 +203,7 @@ inline bool check_aligned(void *ptr, const size_t alignment)
*/
inline std::string tolower(std::string string)
{
- std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c)
- {
- return std::tolower(c);
- });
+ std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c) { return std::tolower(c); });
return string;
}
@@ -227,7 +222,7 @@ inline std::string getenv(const std::string &env_name)
return std::string{};
#else // BARE_METAL
const auto env_chr = std::getenv(env_name.c_str());
- return env_chr == nullptr ? std::string{} : std::string{ env_chr };
+ return env_chr == nullptr ? std::string{} : std::string{env_chr};
#endif // BARE_METAL
}
} // namespace utility
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index a15f3e5cde..2324fe1838 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -41,7 +41,10 @@ namespace quantization
*
* @return a status
*/
-Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon = false);
+Status calculate_quantized_multiplier(float multiplier,
+ int32_t *quant_multiplier,
+ int32_t *shift,
+ bool ignore_epsilon = false);
/** Calculate quantized representation of multiplier with value less than one.
*
* @param[in] multiplier Real multiplier.
@@ -51,7 +54,10 @@ Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplie
*
* @return a status
*/
-Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon = false);
+Status calculate_quantized_multiplier_less_than_one(float multiplier,
+ int32_t *quant_multiplier,
+ int32_t *right_shift,
+ bool ignore_epsilon = false);
/** Calculate quantized representation of multiplier having value greater than one.
*
* @param[in] multiplier Real multiplier.
@@ -60,7 +66,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *q
*
* @return a status
*/
-Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
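The declarations above split a real multiplier into a fixed-point multiplier plus a shift. Below is a standalone sketch of the underlying idea only; it is not ACL's implementation, its sign and rounding conventions may differ, and the decompose_multiplier name is made up.

#include <cmath>
#include <cstdint>

// Decompose m so that m ~= (q / 2^31) * 2^(-s), with q a Q0.31 value in [2^30, 2^31).
inline void decompose_multiplier(float m, int32_t &q, int32_t &s)
{
    int exp = 0;
    const double frac = std::frexp(m, &exp); // m = frac * 2^exp
    s = -exp;
    int64_t q64 = static_cast<int64_t>(std::round(frac * (int64_t(1) << 31)));
    if (q64 == (int64_t(1) << 31)) // rounding pushed frac up to 1.0: renormalise
    {
        q64 /= 2;
        --s;
    }
    q = static_cast<int32_t>(q64);
}
// e.g. m = 0.25 -> q = 1073741824 (2^30), s = 1, since (2^30 / 2^31) * 2^-1 = 0.25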
/** Calculate quantized representation of per-channel multipliers
*
@@ -71,9 +78,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t
*
* @return a status
*/
-Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
- const QuantizationInfo &wq_info,
- const QuantizationInfo &oq_info,
+Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
GEMMLowpOutputStageInfo &stage_info);
/** Get minimum and maximum values for the input quantized data type
@@ -147,7 +154,10 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v);
* @param[out] output_shift Shift for inverse square root
*
*/
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift);
+void get_invsqrt_quantized_multiplier_exp(int32_t input,
+ int32_t reverse_shift,
+ int32_t &output_inv_sqrt,
+ int32_t &output_shift);
} // namespace quantization
} // namespace arm_compute
diff --git a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
index b5af589cd2..3deaff74fc 100644
--- a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
+++ b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
#include <map>
#include <memory>
diff --git a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
index 3daedd4efb..e2a34e4424 100644
--- a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
+++ b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
@@ -53,10 +53,12 @@ namespace dynamic_fusion
enum class MemoryType
{
/** Both User and Auxiliary types are of Alloc type, since they require memory allocation */
- User = 0, /**< Memory coming directly from users, e.g. for argument tensors */
- Auxiliary = 1, /**< Additional memory required by the workload tensor, e.g. for tensors holding temporary results between kernels */
+ User = 0, /**< Memory coming directly from users, e.g. for argument tensors */
+ Auxiliary =
+ 1, /**< Additional memory required by the workload tensor, e.g. for tensors holding temporary results between kernels */
/** Virtual type is of No-Alloc type, since it doesn't require memory allocation */
- Virtual = 2, /**< Temporary tile which is not allocated as a whole tensor in the memory. It is mainly used at sketch time to link operators; there should be no Virtual tensors at runtime */
+ Virtual =
+ 2, /**< Temporary tile which is not allocated as a whole tensor in the memory. It is mainly used at sketch time to link operators; there should be no Virtual tensors at runtime */
};
/** Memory information for tensors with @ref MemoryType::Auxiliary.
@@ -66,9 +68,7 @@ struct AuxMemoryInfo
{
AuxMemoryInfo() = default;
- AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
- : size(size),
- alignment(alignment)
+ AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept : size(size), alignment(alignment)
{
}
@@ -76,8 +76,8 @@ struct AuxMemoryInfo
{
return info0.size == info1.size && info0.alignment == info1.alignment;
}
- size_t size{ 0 }; /**< Total memory size in bytes */
- size_t alignment{ 0 }; /**< Memory alignment in bytes */
+ size_t size{0}; /**< Total memory size in bytes */
+ size_t alignment{0}; /**< Memory alignment in bytes */
};
/** Descriptor of a workload tensor memory */
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h
index 59efc8bd5d..ba2f658c7c 100644
--- a/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h
@@ -49,8 +49,8 @@ public:
ConvertPolicy convert_policy() const;
private:
- DataType _data_type{}; /**< Data Type to be cast to */
- ConvertPolicy _convert_policy{ ConvertPolicy::SATURATE }; /**< Convert Policy */
+ DataType _data_type{}; /**< Data Type to be cast to */
+ ConvertPolicy _convert_policy{ConvertPolicy::SATURATE}; /**< Convert Policy */
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h
index 58102d8d88..a98ef0363e 100644
--- a/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
+
#include <cstdint>
namespace arm_compute
@@ -55,9 +56,9 @@ public:
Size2D dilation() const;
private:
- Padding2D _pad{}; /**< Padding */
- Size2D _stride{ 1U, 1U }; /**< Stride */
- Size2D _dilation{ 1U, 1U }; /**< Dilation */
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{1U, 1U}; /**< Stride */
+ Size2D _dilation{1U, 1U}; /**< Dilation */
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
index 6d05e9e4d6..c46b25cb5d 100644
--- a/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
+
#include <cstdint>
namespace arm_compute
@@ -63,11 +64,11 @@ public:
DimensionRoundingType dimension_rounding_type() const;
private:
- Padding2D _pad{}; /**< Padding */
- Size2D _stride{ 1U, 1U }; /**< Stride */
- Size2D _dilation{ 1U, 1U }; /**< Dilation */
- uint32_t _depth_multiplier{ 1U }; /**< Depth multiplier */
- DimensionRoundingType _dimension_rounding_type{ DimensionRoundingType::FLOOR }; /**< Dimension rounding type */
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{1U, 1U}; /**< Stride */
+ Size2D _dilation{1U, 1U}; /**< Dilation */
+ uint32_t _depth_multiplier{1U}; /**< Depth multiplier */
+ DimensionRoundingType _dimension_rounding_type{DimensionRoundingType::FLOOR}; /**< Dimension rounding type */
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
index be30781d86..19d8b96dcf 100644
--- a/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
@@ -72,8 +72,8 @@ private:
PoolingType _pool_type{};
Padding2D _pad{};
Size2D _pool_size{};
- Size2D _stride{ 1U, 1U };
- bool _exclude_padding{ true };
+ Size2D _stride{1U, 1U};
+ bool _exclude_padding{true};
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h
index 8992693cd1..7410cc7e70 100644
--- a/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include <cstdint>
namespace arm_compute
@@ -75,9 +76,9 @@ public:
private:
int32_t _output_width{};
int32_t _output_height{};
- InterpolationPolicy _interpolation_policy{ InterpolationPolicy::BILINEAR };
- SamplingPolicy _sampling_policy{ SamplingPolicy::CENTER };
- bool _align_corners{ false };
+ InterpolationPolicy _interpolation_policy{InterpolationPolicy::BILINEAR};
+ SamplingPolicy _sampling_policy{SamplingPolicy::CENTER};
+ bool _align_corners{false};
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h
index fc50aa0f68..61654bfa27 100644
--- a/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h
@@ -51,7 +51,7 @@ public:
int axis() const;
private:
- float _beta{ 1.f }; /**< Scaling factor for the exponent */
+ float _beta{1.f}; /**< Scaling factor for the exponent */
bool _is_log_softmax{}; /**< True if operation is log-softmax */
int _axis{}; /**< Axis/Dimension to perform the operation */
};
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
index 0b60899734..38b350c7eb 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
@@ -85,7 +85,7 @@ public:
* @return TensorInfo Newly created tensor info
*/
template <typename... TArgs>
- TensorInfo create_tensor_info(TArgs &&... args)
+ TensorInfo create_tensor_info(TArgs &&...args)
{
auto tensor_info = TensorInfo(std::forward<TArgs>(args)...);
register_user_tensor(tensor_info);
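A short usage sketch for the helper above; the shape, data type and the make_src_info name are hypothetical, and the workload context is assumed to be fully set up by the caller.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

// The arguments are forwarded to TensorInfo's constructor and the resulting
// tensor info is registered with the workload context before being returned.
TensorInfo make_src_info(GpuWorkloadContext &context)
{
    return context.create_tensor_info(TensorShape(32U, 32U), 1, DataType::F32);
}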
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
index 33eded4dff..5b6c1b90ab 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
@@ -65,9 +65,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *lhs,
- ITensorInfo *rhs);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
@@ -76,18 +74,14 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs);
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs);
/** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
*
* Parameters are similar to @ref GpuAdd::create_op()
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *rhs,
- const ITensorInfo *lhs);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *rhs, const ITensorInfo *lhs);
};
} // namespace dynamic_fusion
} // namespace experimental
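To show how the reshaped declarations above fit together, a hedged sketch that fuses an element-wise addition into a sketch and marks its result as the workload output (GpuOutput appears further down in this patch). The add_then_output name is made up, the sketch and tensor infos are assumed to be created elsewhere, and in real code the Status returned by is_supported_op()/validate_op() should be checked first.

#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void add_then_output(GpuWorkloadSketch &sketch, ITensorInfo &lhs, ITensorInfo &rhs, ITensorInfo &dst)
{
    ITensorInfo *sum = GpuAdd::create_op(sketch, &lhs, &rhs); // returns the fused destination info
    GpuOutput::create_op(sketch, sum, &dst);                  // bind it to a user-visible output
}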
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
index 83b004b8b8..1593cec804 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
@@ -68,9 +68,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Attributes &attributes);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
@@ -79,18 +77,15 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
/** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
*
* Parameters are similar to @ref GpuCast::create_op()
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h
index e96251196a..5dd77bdc8e 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h
@@ -62,9 +62,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Attributes &attributes);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
@@ -74,9 +72,8 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
/** Validate the operator and check if it can be fused into the workload sketch.
*
@@ -84,9 +81,7 @@ public:
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
index 612cc83a1f..da7e860757 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
@@ -64,11 +64,8 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *wei,
- ITensorInfo *bia,
- const Attributes &attributes);
+ static ITensorInfo *create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
index a0cb292730..958569efd7 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
@@ -63,11 +63,8 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *wei,
- ITensorInfo *bia,
- const Attributes &attributes);
+ static ITensorInfo *create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
index 3e0ebdd96c..d13e4a3cad 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
@@ -62,9 +62,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *lhs,
- ITensorInfo *rhs);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs);
/** Check if the operator configuration is supported, irrespective of fusion
*
@@ -74,9 +72,7 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs);
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs);
/** Validate the operator and check if the configuration is supported and if it can be fused into the workload sketch.
*
@@ -84,9 +80,7 @@ public:
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *rhs,
- const ITensorInfo *lhs);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *rhs, const ITensorInfo *lhs);
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h
index 06317511cd..deb5559b9d 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h
@@ -56,9 +56,7 @@ public:
* @param[in, out] dst Destination tensor info.
* If an uninitialized ITensorInfo is passed in, it will be auto-initialized.
*/
- static void create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *dst);
+ static void create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst);
/** Check if the operator configuration is supported, irrespective of fusion.
*
@@ -68,9 +66,7 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const ITensorInfo *dst);
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst);
/** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
*
@@ -78,9 +74,7 @@ public:
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const ITensorInfo *dst);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst);
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
index 65a092c0a2..4d2db0e89c 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
@@ -55,8 +55,8 @@ public:
GpuPool2dSettings use_inf_as_limit(bool use_inf_as_limit);
private:
- bool _mixed_precision{ false };
- bool _use_inf_as_limit{ true };
+ bool _mixed_precision{false};
+ bool _use_inf_as_limit{true};
};
/** Operator interface. */
@@ -86,10 +86,8 @@ public:
* @param[in] attributes Operator attributes
* @param[in] settings Operator settings
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Attributes &attributes,
- const Settings &settings);
+ static ITensorInfo *
+ create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes, const Settings &settings);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
index 0f50127199..dc194fcadb 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
@@ -62,9 +62,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Attributes &attributes);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
@@ -73,18 +71,15 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
/** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
*
* Parameters are similar to @ref GpuReshape::create_op()
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
index 2579d10f5b..e2ece80a1d 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
@@ -67,9 +67,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Attributes &attributes);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
@@ -78,18 +76,15 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
/** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
*
* Parameters are similar to @ref GpuResize::create_op()
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const Attributes &attributes);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h
index 616a61e614..798b84b906 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h
@@ -59,8 +59,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src);
/** Check if the operator configuration is supported, irrespective of fusion
*
@@ -69,8 +68,7 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src);
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src);
/** Validate the operator and check if it can be fused into the workload sketch.
*
@@ -78,8 +76,7 @@ public:
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src);
};
} // namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h
index e86ef91e6a..66c2d77310 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h
@@ -62,10 +62,7 @@ public:
* @param[in] dst Destination tensor info.
* @param[in] attributes Operator attributes
*/
- static void create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *dst,
- const Attributes &attributes);
+ static void create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes);
/** Check if the operator configuration is supported, irrespective of fusion
*
* @param[in] context Workload context within which the operator is running
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
index 6f8c2d0b76..2d9255fff2 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
@@ -65,9 +65,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *lhs,
- ITensorInfo *rhs);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs);
/** Check if the operator configuration is supported, irrespective of fusion
*
@@ -77,9 +75,7 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs);
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs);
/** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
*
@@ -87,9 +83,7 @@ public:
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *rhs,
- const ITensorInfo *lhs);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *rhs, const ITensorInfo *lhs);
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h
index 08b5032e93..9c0ce6de02 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h
@@ -59,8 +59,7 @@ public:
*
* @return Pointer for the destination tensor info
*/
- static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src);
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src);
/** Check if the operator configuration is supported, irrespective of fusion
*
@@ -69,8 +68,7 @@ public:
*
* @return Status
*/
- static Status is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src);
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src);
/** Validate the operator and check if it can be fused into the workload sketch.
*
@@ -78,8 +76,7 @@ public:
*
* @return Status
*/
- static Status validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src);
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src);
};
} // namespace dynamic_fusion
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 84e962cb3a..195b67cf99 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -39,17 +39,17 @@ enum class ActivationFunction
RELU, /**< Rectifier ( \f$ f(x) = max(0,x) \f$ ) */
BOUNDED_RELU, /**< Upper Bounded Rectifier ( \f$ f(x) = min(a, max(0,x)) \f$ ) */
LU_BOUNDED_RELU, /**< Lower and Upper Bounded Rectifier ( \f$ f(x) = min(a, max(b,x)) \f$ ) */
- LEAKY_RELU, /**< Leaky Rectifier ( \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
- SOFT_RELU, /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */
- ELU, /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
- ABS, /**< Absolute ( \f$ f(x)= |x| \f$ ) */
- SQUARE, /**< Square ( \f$ f(x)= x^2 \f$ )*/
- SQRT, /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/
- LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
- IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
- HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */
- SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
- GELU /**< GELU ( \f$ f(x) = x * 1/2 * (1 + erf(x / \sqrt{2})) \f$ ) */
+ LEAKY_RELU, /**< Leaky Rectifier ( \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
+ SOFT_RELU, /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */
+ ELU, /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
+ ABS, /**< Absolute ( \f$ f(x)= |x| \f$ ) */
+ SQUARE, /**< Square ( \f$ f(x)= x^2 \f$ )*/
+ SQRT, /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/
+ LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
+ IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
+ HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */
+ SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
+ GELU /**< GELU ( \f$ f(x) = x * 1/2 * (1 + erf(x / \sqrt{2})) \f$ ) */
};
/** Activation Layer Information class */
class ActivationLayerInfo
@@ -68,8 +68,7 @@ public:
* (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
* @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::TANH).
*/
- ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f)
- : _act(f), _a(a), _b(b), _enabled(true)
+ ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f) : _act(f), _a(a), _b(b), _enabled(true)
{
}
/** Get the type of activation function */
@@ -104,10 +103,10 @@ public:
}
#endif // __aarch64__
private:
- ActivationFunction _act = { ActivationLayerInfo::ActivationFunction::IDENTITY };
+ ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY};
float _a = {};
float _b = {};
- bool _enabled = { false };
+ bool _enabled = {false};
#ifdef __aarch64__
LookupTable256 _lut = {};
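For reference, a minimal construction sketch for the class above; the variable name and the parameter value are hypothetical.

#include "arm_compute/function_info/ActivationLayerInfo.h"

using namespace arm_compute;

// A ReLU6-style activation: BOUNDED_RELU clamps the output to [0, a] with a = 6.
const ActivationLayerInfo relu6_info(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);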
diff --git a/arm_compute/function_info/ConvolutionInfo.h b/arm_compute/function_info/ConvolutionInfo.h
index c27dc523c8..4830cae137 100644
--- a/arm_compute/function_info/ConvolutionInfo.h
+++ b/arm_compute/function_info/ConvolutionInfo.h
@@ -33,14 +33,18 @@ namespace arm_compute
struct ConvolutionInfo
{
ConvolutionInfo() = default;
- ConvolutionInfo(const PadStrideInfo &pad_stride_info, unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+ ConvolutionInfo(const PadStrideInfo &pad_stride_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
: pad_stride_info(pad_stride_info), depth_multiplier(depth_multiplier), act_info(act_info), dilation(dilation)
{
}
- PadStrideInfo pad_stride_info{}; /**< Convolution info (Pads, strides,...) */
- unsigned int depth_multiplier{ 1 }; /**< Multiplier to apply to input's depth to retrieve the output depth. Defaults to 1 */
- ActivationLayerInfo act_info{}; /**< Fused activation to apply after convolution. */
- Size2D dilation{ Size2D(1, 1) }; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */
+ PadStrideInfo pad_stride_info{}; /**< Convolution info (Pads, strides,...) */
+ unsigned int depth_multiplier{
+ 1}; /**< Multiplier to apply to input's depth to retrieve the output depth. Defaults to 1 */
+ ActivationLayerInfo act_info{}; /**< Fused activation to apply after convolution. */
+ Size2D dilation{Size2D(1, 1)}; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */
};
} // namespace arm_compute
#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_CONVOLUTIONINFO */
diff --git a/arm_compute/function_info/FullyConnectedLayerInfo.h b/arm_compute/function_info/FullyConnectedLayerInfo.h
index 5f5578eadd..e65daeb2d4 100644
--- a/arm_compute/function_info/FullyConnectedLayerInfo.h
+++ b/arm_compute/function_info/FullyConnectedLayerInfo.h
@@ -35,13 +35,13 @@ struct FullyConnectedLayerInfo
/* Fused-activation parameters */
ActivationLayerInfo activation_info{}; /**< Fused activation to apply after the matrix multiplication. */
/* Information about weights */
- DataLayout weights_trained_layout{ DataLayout::NCHW }; /**< Layout that the weights have been trained with. */
- bool transpose_weights{ true }; /**< Transpose weights if true. */
- bool are_weights_reshaped{ false }; /**< @deprecated Reshape the weights tensor if false. */
- bool retain_internal_weights{ false }; /**< Retain internal reshaped weights. */
- bool enable_fast_math{ false }; /**< Enable fast math computation. */
+ DataLayout weights_trained_layout{DataLayout::NCHW}; /**< Layout that the weights have been trained with. */
+ bool transpose_weights{true}; /**< Transpose weights if true. */
+ bool are_weights_reshaped{false}; /**< @deprecated Reshape the weights tensor if false. */
+ bool retain_internal_weights{false}; /**< Retain internal reshaped weights. */
+ bool enable_fast_math{false}; /**< Enable fast math computation. */
/* Other parameters */
- bool fp_mixed_precision{ false }; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */
+ bool fp_mixed_precision{false}; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */
/** Sets the weights trained data layout
*
diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h
index 29a57a00c2..c24762c0aa 100644
--- a/arm_compute/function_info/GEMMInfo.h
+++ b/arm_compute/function_info/GEMMInfo.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include <vector>
namespace arm_compute
@@ -43,17 +44,22 @@ enum class GEMMLowpOutputStageType
/** GEMMLowp output stage info */
struct GEMMLowpOutputStageInfo
{
- GEMMLowpOutputStageType type{ GEMMLowpOutputStageType::NONE }; /**< GEMMLowp output stage type */
- int32_t gemmlowp_offset{ 0 }; /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
- int32_t gemmlowp_multiplier{ 0 }; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
- int32_t gemmlowp_shift{ 0 }; /**< GEMMLowp output stage shift used for quantizing to uint8 */
- int32_t gemmlowp_min_bound{ std::numeric_limits<int32_t>::lowest() }; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
- int32_t gemmlowp_max_bound{ std::numeric_limits<int32_t>::max() }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
- std::vector<int32_t> gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
- std::vector<int32_t> gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
- float gemmlowp_real_multiplier{ 0 }; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */
- bool is_quantized_per_channel{ false }; /**< GEMMLowp quantized per-channel flag */
- DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */
+ GEMMLowpOutputStageType type{GEMMLowpOutputStageType::NONE}; /**< GEMMLowp output stage type */
+ int32_t gemmlowp_offset{0}; /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
+ int32_t gemmlowp_multiplier{0}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+ int32_t gemmlowp_shift{0}; /**< GEMMLowp output stage shift used for quantizing to uint8 */
+ int32_t gemmlowp_min_bound{
+ std::numeric_limits<int32_t>::
+ lowest()}; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
+ int32_t gemmlowp_max_bound{
+ std::numeric_limits<int32_t>::
+ max()}; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
+ std::vector<int32_t> gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+ std::vector<int32_t> gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+ float gemmlowp_real_multiplier{0}; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */
+ bool is_quantized_per_channel{false}; /**< GEMMLowp quantized per-channel flag */
+ DataType output_data_type{
+ DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */
};
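A hedged sketch of filling the struct above for a requantisation to QASYMM8; every number is hypothetical, the make_output_stage name is made up, and the QUANTIZE_DOWN_FIXEDPOINT enumerator is assumed to be the usual fixed-point stage.

#include "arm_compute/function_info/GEMMInfo.h"

using namespace arm_compute;

GEMMLowpOutputStageInfo make_output_stage()
{
    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_offset     = 10;         // destination zero point
    info.gemmlowp_multiplier = 1073741824; // ~0.5 in Q0.31
    info.gemmlowp_shift      = 1;          // extra right shift after the fixed-point multiply
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;
    info.output_data_type    = DataType::QASYMM8;
    return info;
}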
/** GEMM information class. This class stores the necessary information to compute GEMM functions
*
@@ -100,9 +106,19 @@ public:
* @param[in] fixed_format (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in a memory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat.
* @param[in] weight_format (Optional) arm_gemm::WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
*/
- GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
- GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool fast_math = false, bool broadcast_bias = false,
- const ActivationLayerInfo &activation_info = ActivationLayerInfo(), bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
+ GEMMInfo(bool is_a_reshaped,
+ bool is_b_reshaped,
+ bool reshape_b_only_on_first_run,
+ int depth_output_gemm3d = 0,
+ bool reinterpret_input_as_3d = false,
+ bool retain_internal_weights = false,
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(),
+ bool fp_mixed_precision = false,
+ bool fast_math = false,
+ bool broadcast_bias = false,
+ const ActivationLayerInfo &activation_info = ActivationLayerInfo(),
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
: _is_a_reshaped(is_a_reshaped),
_is_b_reshaped(is_b_reshaped),
_reshape_b_only_on_first_run(reshape_b_only_on_first_run),
diff --git a/arm_compute/function_info/MatMulInfo.h b/arm_compute/function_info/MatMulInfo.h
index cd9ef1f4d9..fc73efb44a 100644
--- a/arm_compute/function_info/MatMulInfo.h
+++ b/arm_compute/function_info/MatMulInfo.h
@@ -55,8 +55,8 @@ public:
}
private:
- bool _adj_lhs{ false };
- bool _adj_rhs{ false };
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
};
} // namespace arm_compute
#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_MATMULINFO */
diff --git a/arm_compute/graph/Edge.h b/arm_compute/graph/Edge.h
index 5e81b9c52f..7f5075d885 100644
--- a/arm_compute/graph/Edge.h
+++ b/arm_compute/graph/Edge.h
@@ -48,8 +48,18 @@ public:
* @param[in] consumer_idx Consumer node input index
* @param[in] tensor Tensor associated with the edge
*/
- Edge(EdgeID id, INode *producer, unsigned int producer_idx, INode *consumer, unsigned int consumer_idx, Tensor *tensor)
- : _id(id), _producer(producer), _consumer(consumer), _producer_idx(producer_idx), _consumer_idx(consumer_idx), _tensor(tensor)
+ Edge(EdgeID id,
+ INode *producer,
+ unsigned int producer_idx,
+ INode *consumer,
+ unsigned int consumer_idx,
+ Tensor *tensor)
+ : _id(id),
+ _producer(producer),
+ _consumer(consumer),
+ _producer_idx(producer_idx),
+ _consumer_idx(consumer_idx),
+ _tensor(tensor)
{
}
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index 806d84c3fd..e6e173f5fa 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -79,7 +79,7 @@ public:
* @return ID of the node
*/
template <typename NT, typename... Ts>
- NodeID add_node(Ts &&... args);
+ NodeID add_node(Ts &&...args);
/** Remove the node with the given ID
*
* @param[in] nid ID of the node to remove
@@ -221,17 +221,17 @@ private:
TensorID create_tensor(const TensorDescriptor &desc = TensorDescriptor());
private:
- GraphID _id = GraphID(0); /**< Graph id */
- std::string _name = {}; /**< Graph name */
- std::vector<std::unique_ptr<INode>> _nodes = {}; /**< Graph nodes */
- std::vector<std::unique_ptr<Edge>> _edges = {}; /**< Graph edges */
- std::vector<std::unique_ptr<Tensor>> _tensors = {}; /**< Graph tensors */
+ GraphID _id = GraphID(0); /**< Graph id */
+ std::string _name = {}; /**< Graph name */
+ std::vector<std::unique_ptr<INode>> _nodes = {}; /**< Graph nodes */
+ std::vector<std::unique_ptr<Edge>> _edges = {}; /**< Graph edges */
+ std::vector<std::unique_ptr<Tensor>> _tensors = {}; /**< Graph tensors */
std::map<NodeType, std::vector<NodeID>> _tagged_nodes = {}; /**< Graph nodes map with the node type as key */
- arm_compute::Mutex _mtx = {}; /**< Mutex used for graph construction */
+ arm_compute::Mutex _mtx = {}; /**< Mutex used for graph construction */
};
template <typename NT, typename... Ts>
-inline NodeID Graph::add_node(Ts &&... args)
+inline NodeID Graph::add_node(Ts &&...args)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
@@ -245,7 +245,7 @@ inline NodeID Graph::add_node(Ts &&... args)
_tagged_nodes[node->type()].push_back(nid);
// Associate a new tensor with each output
- for(auto &output : node->_outputs)
+ for (auto &output : node->_outputs)
{
output = create_tensor();
}
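As the reformatted body shows, add_node forwards its arguments to the node constructor under the construction mutex, tags the node by type, and backs each declared output with a freshly created tensor. A hedged sketch of calling it directly (applications normally go through GraphBuilder; InputNode taking a TensorDescriptor is an assumption here, not shown in the patch):

    arm_compute::graph::Graph g(0, "example");
    const arm_compute::graph::TensorDescriptor desc(arm_compute::TensorShape(224U, 224U, 3U),
                                                    arm_compute::DataType::F32);
    // Arguments are perfectly forwarded to the InputNode constructor.
    arm_compute::graph::NodeID in_id = g.add_node<arm_compute::graph::InputNode>(desc);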
diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h
index cb88c0e7aa..118d06bdda 100644
--- a/arm_compute/graph/GraphBuilder.h
+++ b/arm_compute/graph/GraphBuilder.h
@@ -51,7 +51,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
+ static NodeID
+ add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
/** Adds an input layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -61,7 +62,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
+ static NodeID
+ add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
/** Adds an output layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -71,7 +73,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor = nullptr);
+ static NodeID
+ add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor = nullptr);
/** Adds an activation layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -82,7 +85,10 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info,
+ static NodeID add_activation_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ActivationLayerInfo act_info,
const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds an arg_min_max layer node to the graph
*
@@ -96,7 +102,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis,
+ static NodeID add_arg_min_max_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ReductionOperation op,
+ unsigned int axis,
DataType out_data_type = DataType::UNKNOWN,
const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds a batch normalization layer node to the graph
@@ -112,9 +122,14 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
- ITensorAccessorUPtr mean_accessor = nullptr, ITensorAccessorUPtr var_accessor = nullptr,
- ITensorAccessorUPtr beta_accessor = nullptr, ITensorAccessorUPtr gamma_accessor = nullptr);
+ static NodeID add_batch_normalization_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ float epsilon,
+ ITensorAccessorUPtr mean_accessor = nullptr,
+ ITensorAccessorUPtr var_accessor = nullptr,
+ ITensorAccessorUPtr beta_accessor = nullptr,
+ ITensorAccessorUPtr gamma_accessor = nullptr);
/** Adds a bounding box transform layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -125,7 +140,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info);
+ static NodeID add_bounding_box_transform_node(
+ Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info);
/** Adds a channel shuffle layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -154,10 +170,17 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, unsigned int num_groups = 1,
- ConvolutionMethod method = ConvolutionMethod::Default, FastMathHint fast_math_hint = FastMathHint::Disabled,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr,
+ static NodeID add_convolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ unsigned int depth,
+ PadStrideInfo conv_info,
+ unsigned int num_groups = 1,
+ ConvolutionMethod method = ConvolutionMethod::Default,
+ FastMathHint fast_math_hint = FastMathHint::Disabled,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr,
const QuantizationInfo &weights_quant_info = QuantizationInfo(),
const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds a deconvolution layer node to the graph
@@ -173,9 +196,14 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr);
+ static NodeID add_deconvolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ unsigned int depth,
+ PadStrideInfo deconv_info,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr);
/** Adds a depth concatenate node to the graph
*
* @param[in] g Graph to add the node to
@@ -185,7 +213,10 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor);
+ static NodeID add_concatenate_node(Graph &g,
+ NodeParams params,
+ const std::vector<NodeIdxPair> &inputs,
+ const descriptors::ConcatLayerDescriptor &concat_descriptor);
/** Adds a depth to space layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -212,11 +243,18 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, PadStrideInfo conv_info, int depth_multiplier = 1,
- DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, const QuantizationInfo &quant_info = QuantizationInfo(),
- const QuantizationInfo &out_quant_info = QuantizationInfo());
+ static NodeID
+ add_depthwise_convolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ PadStrideInfo conv_info,
+ int depth_multiplier = 1,
+ DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr,
+ const QuantizationInfo &quant_info = QuantizationInfo(),
+ const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds an element-wise layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -227,7 +265,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation);
+ static NodeID add_elementwise_node(
+ Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation);
/** Adds a dequantization node to the graph
*
* @param[in] g Graph to add the node to
@@ -248,7 +287,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info);
+ static NodeID add_detection_output_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input_loc,
+ NodeIdxPair input_conf,
+ NodeIdxPair input_priorbox,
+ const DetectionOutputLayerInfo &detect_info);
/** Adds a detection post process layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -261,8 +305,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction,
- const DetectionPostProcessLayerInfo &detect_info, ITensorAccessorUPtr anchors_accessor = nullptr,
+ static NodeID add_detection_post_process_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input_box_encoding,
+ NodeIdxPair input_class_prediction,
+ const DetectionPostProcessLayerInfo &detect_info,
+ ITensorAccessorUPtr anchors_accessor = nullptr,
const QuantizationInfo &anchor_quant_info = QuantizationInfo());
/** Adds a Dummy node to the graph
*
@@ -299,8 +347,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
- NodeID weights_nid, NodeID bias_nid = EmptyNodeID,
+ static NodeID add_fully_connected_layer(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ unsigned int num_outputs,
+ NodeID weights_nid,
+ NodeID bias_nid = EmptyNodeID,
const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
const QuantizationInfo &out_quant_info = QuantizationInfo(),
FastMathHint fast_math_hint = FastMathHint::Disabled);
@@ -319,9 +371,13 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr,
- const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ static NodeID add_fully_connected_layer(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ unsigned int num_outputs,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr,
+ const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
const QuantizationInfo &weights_quant_info = QuantizationInfo(),
const QuantizationInfo &out_quant_info = QuantizationInfo(),
FastMathHint fast_math_hint = FastMathHint::Disabled);
@@ -336,8 +392,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas,
- NodeIdxPair anchors, GenerateProposalsInfo info);
+ static NodeID add_generate_proposals_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair scores,
+ NodeIdxPair deltas,
+ NodeIdxPair anchors,
+ GenerateProposalsInfo info);
/** Adds a L2 Normalize layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -358,7 +418,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info);
+ static NodeID
+ add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info);
/** Adds a normalize planar YUV layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -369,8 +430,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
- ITensorAccessorUPtr mean_accessor = nullptr, ITensorAccessorUPtr std_accessor = nullptr);
+ static NodeID add_normalize_planar_yuv_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ITensorAccessorUPtr mean_accessor = nullptr,
+ ITensorAccessorUPtr std_accessor = nullptr);
/** Adds a pad layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -382,7 +446,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value = PixelValue());
+ static NodeID add_pad_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ const PaddingList &paddings,
+ PixelValue pad_value = PixelValue());
/** Adds a permute layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -394,7 +462,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN);
+ static NodeID add_permute_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ PermutationVector perm,
+ DataLayout layout = DataLayout::UNKNOWN);
/** Adds a pooling layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -426,8 +498,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(),
- const std::function<ITensor *(ITensor *)> transform = nullptr);
+ static NodeID add_print_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ std::ostream &stream,
+ const IOFormatInfo &format_info = IOFormatInfo(),
+ const std::function<ITensor *(ITensor *)> transform = nullptr);
/** Adds a priorbox layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -438,7 +514,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info);
+ static NodeID add_priorbox_node(
+ Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info);
/** Adds a quantization layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -448,7 +525,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info);
+ static NodeID
+ add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info);
/** Adds a reduction sum layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -460,7 +538,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims = true);
+ static NodeID add_reduction_operation_node(
+ Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims = true);
/** Adds a reorg layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -492,7 +571,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale);
+ static NodeID add_resize_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ InterpolationPolicy policy,
+ float width_scale,
+ float height_scale);
/** Adds a ROI align layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -503,7 +587,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info);
+ static NodeID
+ add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info);
/** Adds a scale layer node to the graph
* This layer computes a product of the input with a scale (read from mul_accessor) and it applies an offset (read from add_accessor).
* output = input * mul_w + add_w
@@ -516,8 +601,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input,
- ITensorAccessorUPtr mul_accessor = nullptr, ITensorAccessorUPtr add_accessor = nullptr);
+ static NodeID add_scale_layer(Graph &g,
+ const NodeParams &params,
+ NodeIdxPair input,
+ ITensorAccessorUPtr mul_accessor = nullptr,
+ ITensorAccessorUPtr add_accessor = nullptr);
/** Adds a softmax node to the graph
*
* @param[in] g Graph to add the node to
@@ -538,7 +626,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends);
+ static NodeID
+ add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends);
/** Adds a split node to the graph
*
* @param[in] g Graph to add the node to
@@ -549,7 +638,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis = 0);
+ static NodeID
+ add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis = 0);
/** Adds a stack layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -572,7 +662,13 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info);
+ static NodeID add_strided_slice_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Coordinates &starts,
+ Coordinates &ends,
+ BiStrides &strides,
+ StridedSliceLayerInfo info);
/** Adds a yolo layer to the graph
*
* @param[in] g Graph to add the node to
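Taken together, the reflowed GraphBuilder signatures above read parameter by parameter. A hedged end-to-end sketch of the API (shapes, the NEON target and the omitted weight/bias accessors are illustrative, not taken from the patch):

    using namespace arm_compute;
    using namespace arm_compute::graph;

    Graph      g;
    NodeParams params{"example", Target::NEON};

    NodeID in   = GraphBuilder::add_input_node(g, params,
                                               TensorDescriptor(TensorShape(224U, 224U, 3U), DataType::F32));
    NodeID conv = GraphBuilder::add_convolution_node(g, params, {in, 0}, Size2D(3U, 3U), 32U,
                                                     PadStrideInfo(1, 1, 1, 1));
    NodeID act  = GraphBuilder::add_activation_node(g, params, {conv, 0},
                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    GraphBuilder::add_output_node(g, params, {act, 0});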
diff --git a/arm_compute/graph/GraphContext.h b/arm_compute/graph/GraphContext.h
index 7beb598646..68fbaf5478 100644
--- a/arm_compute/graph/GraphContext.h
+++ b/arm_compute/graph/GraphContext.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_GRAPH_CONTEXT_H
#include "arm_compute/graph/Types.h"
-
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
@@ -39,18 +38,18 @@ namespace graph
/** Contains structs required for memory management */
struct MemoryManagerContext
{
- Target target = { Target::UNSPECIFIED }; /**< Target */
- std::shared_ptr<arm_compute::IMemoryManager> intra_mm = { nullptr }; /**< Intra-function memory manager */
- std::shared_ptr<arm_compute::IMemoryManager> cross_mm = { nullptr }; /**< Cross-function memory manager */
- std::shared_ptr<arm_compute::IMemoryGroup> cross_group = { nullptr }; /**< Cross-function memory group */
- IAllocator *allocator = { nullptr }; /**< Backend allocator to use */
+ Target target = {Target::UNSPECIFIED}; /**< Target */
+ std::shared_ptr<arm_compute::IMemoryManager> intra_mm = {nullptr}; /**< Intra-function memory manager */
+ std::shared_ptr<arm_compute::IMemoryManager> cross_mm = {nullptr}; /**< Cross-function memory manager */
+ std::shared_ptr<arm_compute::IMemoryGroup> cross_group = {nullptr}; /**< Cross-function memory group */
+ IAllocator *allocator = {nullptr}; /**< Backend allocator to use */
};
/** Contains structs required for weights management */
struct WeightsManagerContext
{
- Target target = { Target::UNSPECIFIED }; /**< Target */
- std::shared_ptr<arm_compute::IWeightsManager> wm = { nullptr }; /**< Weights manager */
+ Target target = {Target::UNSPECIFIED}; /**< Target */
+ std::shared_ptr<arm_compute::IWeightsManager> wm = {nullptr}; /**< Weights manager */
};
/** Graph context **/
@@ -125,7 +124,7 @@ public:
void finalize();
private:
- GraphConfig _config; /**< Graph configuration */
+ GraphConfig _config; /**< Graph configuration */
std::map<Target, MemoryManagerContext> _memory_managers; /**< Memory managers for each target */
std::map<Target, WeightsManagerContext> _weights_managers; /**< Weights managers for each target */
};
diff --git a/arm_compute/graph/IDeviceBackend.h b/arm_compute/graph/IDeviceBackend.h
index f84aac0ae0..8ae92e3177 100644
--- a/arm_compute/graph/IDeviceBackend.h
+++ b/arm_compute/graph/IDeviceBackend.h
@@ -88,7 +88,8 @@ public:
*
* @return Backend sub-tensor handle
*/
- virtual std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) = 0;
+ virtual std::unique_ptr<ITensorHandle>
+ create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) = 0;
/** Configure a backend Node
*
* @note This creates an appropriate configured backend function for the given node
diff --git a/arm_compute/graph/LayerDescriptors.h b/arm_compute/graph/LayerDescriptors.h
index c11174f2ce..d632ed9e78 100644
--- a/arm_compute/graph/LayerDescriptors.h
+++ b/arm_compute/graph/LayerDescriptors.h
@@ -37,8 +37,7 @@ namespace descriptors
struct ConcatLayerDescriptor
{
/** Default constructor */
- ConcatLayerDescriptor()
- : axis(DataLayoutDimension::CHANNEL), output_qinfo()
+ ConcatLayerDescriptor() : axis(DataLayoutDimension::CHANNEL), output_qinfo()
{
}
@@ -46,8 +45,7 @@ struct ConcatLayerDescriptor
*
* @param[in] axis Axis.
*/
- ConcatLayerDescriptor(DataLayoutDimension axis)
- : axis(axis), output_qinfo()
+ ConcatLayerDescriptor(DataLayoutDimension axis) : axis(axis), output_qinfo()
{
}
@@ -76,9 +74,16 @@ struct EltwiseLayerDescriptor
* @param[in] r_policy (Optional) Rounding policy used for the operation. Defaults to @ref RoundingPolicy::TO_ZERO
* @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo
*/
- EltwiseLayerDescriptor(EltwiseOperation op, QuantizationInfo out_quant_info = QuantizationInfo(), ConvertPolicy c_policy = ConvertPolicy::SATURATE, RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
+ EltwiseLayerDescriptor(EltwiseOperation op,
+ QuantizationInfo out_quant_info = QuantizationInfo(),
+ ConvertPolicy c_policy = ConvertPolicy::SATURATE,
+ RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
ActivationLayerInfo fused_activation = ActivationLayerInfo())
- : op(op), out_quant_info(out_quant_info), c_policy(c_policy), r_policy(r_policy), fused_activation(fused_activation)
+ : op(op),
+ out_quant_info(out_quant_info),
+ c_policy(c_policy),
+ r_policy(r_policy),
+ fused_activation(fused_activation)
{
}
@@ -100,10 +105,16 @@ struct UnaryEltwiseLayerDescriptor
* @param[in] r_policy (Optional) Rounding policy used for the operation. Defaults to @ref RoundingPolicy::TO_ZERO
* @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo
*/
- UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation op, QuantizationInfo out_quant_info = QuantizationInfo(), ConvertPolicy c_policy = ConvertPolicy::SATURATE,
- RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
- ActivationLayerInfo fused_activation = ActivationLayerInfo())
- : op(op), out_quant_info(out_quant_info), c_policy(c_policy), r_policy(r_policy), fused_activation(fused_activation)
+ UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation op,
+ QuantizationInfo out_quant_info = QuantizationInfo(),
+ ConvertPolicy c_policy = ConvertPolicy::SATURATE,
+ RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
+ ActivationLayerInfo fused_activation = ActivationLayerInfo())
+ : op(op),
+ out_quant_info(out_quant_info),
+ c_policy(c_policy),
+ r_policy(r_policy),
+ fused_activation(fused_activation)
{
}
@@ -130,7 +141,7 @@ struct DeconvolutionLayerDescriptor
PadStrideInfo info; /**< Padding and stride information */
QuantizationInfo out_quant_info; /**< Output quantization information */
};
-} // namespace descriptor
+} // namespace descriptors
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_LAYER_DESCRIPTORS_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_LAYER_DESCRIPTORS_H */
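For reference, the reformatted descriptor constructors keep their defaults, so an element-wise addition with saturating conversion and round-to-zero can be described with a single argument (illustrative, not part of the patch):

    arm_compute::graph::descriptors::EltwiseLayerDescriptor add_desc(arm_compute::graph::EltwiseOperation::Add);
    // out_quant_info, c_policy (SATURATE), r_policy (TO_ZERO) and fused_activation keep the defaults above.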
diff --git a/arm_compute/graph/Logger.h b/arm_compute/graph/Logger.h
index 872c650a1a..e83d5f4ddc 100644
--- a/arm_compute/graph/Logger.h
+++ b/arm_compute/graph/Logger.h
@@ -31,14 +31,14 @@
*
* @note It will eventually create all default loggers if they don't exist
*/
-#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \
- do \
- { \
- if(arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \
- { \
- arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
- } \
- } while(false)
+#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \
+ do \
+ { \
+ if (arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \
+ { \
+ arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
+ } \
+ } while (false)
#else /* ARM_COMPUTE_LOGGING_ENABLED */
#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER()
#endif /* ARM_COMPUTE_LOGGING_ENABLED */
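The do { ... } while (false) wrapper that the reformatted macro now lays out one line per statement is the usual trick for making a multi-statement macro expand as a single statement. A hedged fragment (the flag and the helper in the else branch are hypothetical):

    if (enable_graph_logging)                      // hypothetical flag
        ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER(); // expands to one statement, so the else still binds correctly
    else
        report_logging_disabled();                 // hypothetical helper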
diff --git a/arm_compute/graph/Tensor.h b/arm_compute/graph/Tensor.h
index de96c998bd..0ffae28ecc 100644
--- a/arm_compute/graph/Tensor.h
+++ b/arm_compute/graph/Tensor.h
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_GRAPH_TENSOR_H
#define ARM_COMPUTE_GRAPH_TENSOR_H
-#include "arm_compute/graph/Types.h"
-
#include "arm_compute/graph/ITensorAccessor.h"
#include "arm_compute/graph/ITensorHandle.h"
#include "arm_compute/graph/TensorDescriptor.h"
+#include "arm_compute/graph/Types.h"
#include <memory>
#include <set>
diff --git a/arm_compute/graph/TensorDescriptor.h b/arm_compute/graph/TensorDescriptor.h
index 5fa155efc8..46a6ab2c27 100644
--- a/arm_compute/graph/TensorDescriptor.h
+++ b/arm_compute/graph/TensorDescriptor.h
@@ -52,7 +52,11 @@ struct TensorDescriptor final : public misc::ICloneable<TensorDescriptor>
QuantizationInfo tensor_quant_info = QuantizationInfo(),
DataLayout tensor_data_layout = DataLayout::NCHW,
Target tensor_target = Target::UNSPECIFIED)
- : shape(tensor_shape), data_type(tensor_data_type), layout(tensor_data_layout), quant_info(tensor_quant_info), target(tensor_target)
+ : shape(tensor_shape),
+ data_type(tensor_data_type),
+ layout(tensor_data_layout),
+ quant_info(tensor_quant_info),
+ target(tensor_target)
{
}
/** Sets tensor descriptor shape
@@ -106,11 +110,11 @@ struct TensorDescriptor final : public misc::ICloneable<TensorDescriptor>
return std::make_unique<TensorDescriptor>(*this);
}
- TensorShape shape{}; /**< Tensor shape */
- DataType data_type{ DataType::UNKNOWN }; /**< Data type */
- DataLayout layout{ DataLayout::NCHW }; /**< Data layout */
- QuantizationInfo quant_info{}; /**< Quantization info */
- Target target{ Target::UNSPECIFIED }; /**< Target */
+ TensorShape shape{}; /**< Tensor shape */
+ DataType data_type{DataType::UNKNOWN}; /**< Data type */
+ DataLayout layout{DataLayout::NCHW}; /**< Data layout */
+ QuantizationInfo quant_info{}; /**< Quantization info */
+ Target target{Target::UNSPECIFIED}; /**< Target */
};
} // namespace graph
} // namespace arm_compute
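With the initializer list now one member per line, the constructor parameter order is easy to mirror at call sites. An illustrative descriptor (shape, type and layout values are assumptions, not from the patch):

    arm_compute::graph::TensorDescriptor desc(arm_compute::TensorShape(64U, 64U, 128U),
                                              arm_compute::DataType::F16,
                                              arm_compute::QuantizationInfo(),
                                              arm_compute::DataLayout::NHWC);
    // target is left at its default (Target::UNSPECIFIED) and resolved later by the graph.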
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index 9df4eba5ec..5e83820ab3 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -37,7 +37,7 @@ namespace graph
/** Formatted output of the Target. */
inline ::std::ostream &operator<<(::std::ostream &os, const Target &target)
{
- switch(target)
+ switch (target)
{
case Target::UNSPECIFIED:
os << "UNSPECIFIED";
@@ -60,7 +60,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Target &target)
inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
{
- switch(node_type)
+ switch (node_type)
{
case NodeType::ActivationLayer:
os << "ActivationLayer";
@@ -207,7 +207,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
/** Formatted output of the EltwiseOperation type. */
inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &eltwise_op)
{
- switch(eltwise_op)
+ switch (eltwise_op)
{
case EltwiseOperation::Add:
os << "Add";
@@ -231,7 +231,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &el
/** Formatted output of the ConvolutionMethod type. */
inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &method)
{
- switch(method)
+ switch (method)
{
case ConvolutionMethod::Default:
os << "Default";
@@ -255,7 +255,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &m
/** Formatted output of the FastMathHint type. */
inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint)
{
- switch(hint)
+ switch (hint)
{
case FastMathHint::Enabled:
os << "Enabled";
@@ -273,7 +273,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint)
/** Formatted output of the DepthwiseConvolutionMethod type. */
inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolutionMethod &method)
{
- switch(method)
+ switch (method)
{
case DepthwiseConvolutionMethod::Default:
os << "DEFAULT";
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index 8d493403b3..5541e3cbcc 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -86,17 +86,18 @@ struct TensorDescriptor;
/** Graph configuration structure */
struct GraphConfig
{
-    bool use_function_memory_manager{ true }; /**< Use a memory manager to manage per-function auxiliary memory */
- bool use_function_weights_manager{ true }; /**< Use a weights manager to manage transformed weights */
-    bool use_transition_memory_manager{ true }; /**< Use a memory manager to manage transition buffer memory */
- bool use_tuner{ false }; /**< Use a tuner in tunable backends */
- bool use_synthetic_type{ false }; /**< Convert graph to a synthetic graph for a data type */
- DataType synthetic_type{ DataType::QASYMM8 }; /**< The data type of the synthetic graph */
- CLTunerMode tuner_mode{ CLTunerMode::EXHAUSTIVE }; /**< Tuner mode to be used by the CL tuner */
- int num_threads{ -1 }; /**< Number of threads to use (thread capable backends), if 0 the backend will auto-initialize, if -1 the backend will stay as it is. */
- std::string tuner_file{ "acl_tuner.csv" }; /**< File to load/store tuning values from */
- std::string mlgo_file{ "heuristics.mlgo" }; /**< Filename to load MLGO heuristics from */
- CLBackendType backend_type{ CLBackendType::Native }; /**< CL backend type to use */
+    bool use_function_memory_manager{true}; /**< Use a memory manager to manage per-function auxiliary memory */
+ bool use_function_weights_manager{true}; /**< Use a weights manager to manage transformed weights */
+    bool use_transition_memory_manager{true}; /**< Use a memory manager to manage transition buffer memory */
+ bool use_tuner{false}; /**< Use a tuner in tunable backends */
+ bool use_synthetic_type{false}; /**< Convert graph to a synthetic graph for a data type */
+ DataType synthetic_type{DataType::QASYMM8}; /**< The data type of the synthetic graph */
+ CLTunerMode tuner_mode{CLTunerMode::EXHAUSTIVE}; /**< Tuner mode to be used by the CL tuner */
+ int num_threads{
+ -1}; /**< Number of threads to use (thread capable backends), if 0 the backend will auto-initialize, if -1 the backend will stay as it is. */
+ std::string tuner_file{"acl_tuner.csv"}; /**< File to load/store tuning values from */
+ std::string mlgo_file{"heuristics.mlgo"}; /**< Filename to load MLGO heuristics from */
+ CLBackendType backend_type{CLBackendType::Native}; /**< CL backend type to use */
};
/**< Device target types */
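The brace-initialized defaults above can be overridden field by field before the configuration is handed to the graph runtime. An illustrative override (values and file name are assumptions):

    arm_compute::graph::GraphConfig cfg;
    cfg.num_threads = 4;               // thread-capable backends only; -1 leaves the backend untouched
    cfg.use_tuner   = true;            // enable tuning on tunable (CL) backends
    cfg.tuner_file  = "my_tuner.csv";  // hypothetical path for loading/storing tuning values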
diff --git a/arm_compute/graph/Utils.h b/arm_compute/graph/Utils.h
index a3d9012ee9..9813ff05c7 100644
--- a/arm_compute/graph/Utils.h
+++ b/arm_compute/graph/Utils.h
@@ -36,7 +36,7 @@ class GraphContext;
inline bool is_utility_node(INode *node)
{
- std::set<NodeType> utility_node_types = { NodeType::PrintLayer };
+ std::set<NodeType> utility_node_types = {NodeType::PrintLayer};
return utility_node_types.find(node->type()) != utility_node_types.end();
}
diff --git a/arm_compute/graph/Workload.h b/arm_compute/graph/Workload.h
index 5b4533cb6f..8ff0a548ae 100644
--- a/arm_compute/graph/Workload.h
+++ b/arm_compute/graph/Workload.h
@@ -69,8 +69,7 @@ public:
*/
struct ExecutionTask
{
- ExecutionTask(std::unique_ptr<arm_compute::IFunction> &&f, INode *n)
- : task(std::move(f)), node(n)
+ ExecutionTask(std::unique_ptr<arm_compute::IFunction> &&f, INode *n) : task(std::move(f)), node(n)
{
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -97,11 +96,11 @@ struct ExecutionTask
/** Execution workload */
struct ExecutionWorkload
{
- std::vector<Tensor *> inputs = {}; /**< Input handles */
- std::vector<Tensor *> outputs = {}; /**< Output handles */
- std::vector<ExecutionTask> tasks = {}; /**< Execution workload */
- Graph *graph = { nullptr }; /**< Graph bound to the workload */
- GraphContext *ctx = { nullptr }; /**< Graph execution context */
+ std::vector<Tensor *> inputs = {}; /**< Input handles */
+ std::vector<Tensor *> outputs = {}; /**< Output handles */
+ std::vector<ExecutionTask> tasks = {}; /**< Execution workload */
+ Graph *graph = {nullptr}; /**< Graph bound to the workload */
+ GraphContext *ctx = {nullptr}; /**< Graph execution context */
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/backends/BackendRegistrar.h b/arm_compute/graph/backends/BackendRegistrar.h
index 902c12b0a6..2879361fef 100644
--- a/arm_compute/graph/backends/BackendRegistrar.h
+++ b/arm_compute/graph/backends/BackendRegistrar.h
@@ -24,8 +24,8 @@
#ifndef ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H
#define ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H
-#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Types.h"
#include <utility>
@@ -58,4 +58,4 @@ inline BackendRegistrar<T>::BackendRegistrar(Target target)
} // namespace backends
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H */
diff --git a/arm_compute/graph/backends/CL/CLDeviceBackend.h b/arm_compute/graph/backends/CL/CLDeviceBackend.h
index 63674ad794..09e19d7688 100644
--- a/arm_compute/graph/backends/CL/CLDeviceBackend.h
+++ b/arm_compute/graph/backends/CL/CLDeviceBackend.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_CLDEVICEBACKEND_H
#include "arm_compute/graph/IDeviceBackend.h"
-
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"
#include "arm_compute/runtime/CL/CLTuner.h"
@@ -59,22 +58,23 @@ public:
void set_kernel_tuning_mode(CLTunerMode tuning_mode);
// Inherited overridden methods
- void initialize_backend() override;
- void setup_backend_context(GraphContext &ctx) override;
- void release_backend_context(GraphContext &ctx) override;
+ void initialize_backend() override;
+ void setup_backend_context(GraphContext &ctx) override;
+ void release_backend_context(GraphContext &ctx) override;
bool is_backend_supported() override;
IAllocator *backend_allocator() override;
std::unique_ptr<ITensorHandle> create_tensor(const Tensor &tensor) override;
- std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
- std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
- Status validate_node(INode &node) override;
- std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
+ std::unique_ptr<ITensorHandle>
+ create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
+ std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
+ Status validate_node(INode &node) override;
+ std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() override;
void sync() override;
private:
- int _context_count; /**< Counts how many contexts are currently using the backend */
- CLTuner _tuner; /**< CL kernel tuner */
+ int _context_count; /**< Counts how many contexts are currently using the backend */
+ CLTuner _tuner; /**< CL kernel tuner */
CLGEMMHeuristicsHandle _gemm_heuristics; /**< GEMM heuristics */
std::unique_ptr<CLBufferAllocator> _allocator; /**< CL buffer affinity allocator */
std::string _tuner_file; /**< Filename to load/store the tuner's values from */
diff --git a/arm_compute/graph/backends/CL/CLSubTensorHandle.h b/arm_compute/graph/backends/CL/CLSubTensorHandle.h
index 3750fc85ee..85eebec639 100644
--- a/arm_compute/graph/backends/CL/CLSubTensorHandle.h
+++ b/arm_compute/graph/backends/CL/CLSubTensorHandle.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_CLSUBTENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/CL/CLSubTensor.h"
namespace arm_compute
@@ -45,7 +44,10 @@ public:
* @param[in] coords Starting coordinates
* @param[in] extend_parent Extends parent shape if true
*/
- CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent = false);
+ CLSubTensorHandle(ITensorHandle *parent_handle,
+ const TensorShape &shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~CLSubTensorHandle() = default;
/** Allow instances of this class to be move constructed */
@@ -58,10 +60,10 @@ public:
CLSubTensorHandle &operator=(const CLSubTensorHandle &) = delete;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/CL/CLTensorHandle.h b/arm_compute/graph/backends/CL/CLTensorHandle.h
index 16e30efc43..57e9794ec3 100644
--- a/arm_compute/graph/backends/CL/CLTensorHandle.h
+++ b/arm_compute/graph/backends/CL/CLTensorHandle.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_CLTENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
namespace arm_compute
@@ -51,10 +50,10 @@ public:
CLTensorHandle &operator=(CLTensorHandle &&) = default;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 877e1f92e4..fd8b6b5a69 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -24,19 +24,19 @@
#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
+#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
+#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/Nodes.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
-#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
-#include "arm_compute/graph/backends/Utils.h"
-#include "arm_compute/graph/nodes/Nodes.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorInfo.h"
#include "support/Cast.h"
namespace arm_compute
@@ -59,13 +59,16 @@ template <typename TargetInfo>
typename TargetInfo::TensorType *get_backing_tensor(arm_compute::graph::Tensor *tensor)
{
typename TargetInfo::TensorType *backing_tensor = nullptr;
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
ARM_COMPUTE_ERROR_ON(tensor->desc().target != TargetInfo::TargetType);
// Get backing tensor handle
ITensorHandle *tensor_handle = tensor->handle();
// Get backing tensor
- backing_tensor = (tensor_handle != nullptr) ? arm_compute::utils::cast::polymorphic_cast<typename TargetInfo::TensorType *>(&tensor_handle->tensor()) : nullptr;
+ backing_tensor = (tensor_handle != nullptr)
+ ? arm_compute::utils::cast::polymorphic_cast<typename TargetInfo::TensorType *>(
+ &tensor_handle->tensor())
+ : nullptr;
}
return backing_tensor;
@@ -74,11 +77,8 @@ typename TargetInfo::TensorType *get_backing_tensor(arm_compute::graph::Tensor *
template <typename TargetInfo>
void validate_node(const INode &node, size_t num_expected_inputs, size_t num_expected_outputs)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " ID: " << node.id()
- << node.name()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating " << node.type() << " Target: " << TargetInfo::TargetType
+ << " ID: " << node.id() << node.name() << std::endl);
ARM_COMPUTE_ERROR_ON(TargetInfo::TargetType != node.assigned_target());
ARM_COMPUTE_ERROR_ON(node.num_inputs() != num_expected_inputs);
@@ -109,17 +109,11 @@ std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
auto func = std::make_unique<ActivationLayerFunction>();
func->configure(input, output, act_info);
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Activation function: " << act_info.activation()
- << " a: " << act_info.a()
- << " b: " << act_info.b()
- << " InPlace : " << is_in_place_operation(input, output)
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO(
+ "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Shape: " << input->info()->tensor_shape()
+ << " Activation function: " << act_info.activation() << " a: " << act_info.a() << " b: "
+ << act_info.b() << " InPlace : " << is_in_place_operation(input, output) << std::endl);
return func;
}
@@ -148,15 +142,10 @@ std::unique_ptr<IFunction> create_arg_min_max_layer(ArgMinMaxLayerNode &node)
auto func = std::make_unique<ArgMinMaxLayerFunction>();
func->configure(input, axis, output, op);
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Reduction Operation: " << op
- << " axis: " << axis
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Reduction Operation: " << op << " axis: " << axis << std::endl);
return func;
}
@@ -191,16 +180,11 @@ std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLa
func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Epsilon: " << epsilon << " "
- << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
- << " InPlace: " << is_in_place_operation(input, output)
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape() << " Epsilon: " << epsilon
+ << " " << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+ << " InPlace: " << is_in_place_operation(input, output) << std::endl);
return func;
}
@@ -216,7 +200,8 @@ std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLa
* @return Backend batch normalization layer function
*/
template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx)
+std::unique_ptr<IFunction>
+create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx)
{
validate_node<TargetInfo>(node, 7 /* expected inputs */, 1 /* expected outputs */);
@@ -246,19 +231,16 @@ std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(Fu
// Create and configure function
std::tie(func, func_name) = create_named_memory_managed_function<FType>(
- std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, fused_act);
+ std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta,
+ gamma, epsilon, conv_info, num_groups, fast_math, fused_act);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
return func;
}
@@ -273,7 +255,9 @@ std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(Fu
* @return Backend fused depthwise convolution batch normalization layer function
*/
template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node, GraphContext &ctx)
+std::unique_ptr<IFunction>
+create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node,
+ GraphContext &ctx)
{
validate_node<TargetInfo>(node, 7 /* expected inputs */, 1 /* expected outputs */);
@@ -302,19 +286,16 @@ std::unique_ptr<IFunction> create_fused_depthwise_convolution_batch_normalizatio
// Create and configure function
std::tie(func, func_name) = create_named_memory_managed_function<FType>(
- std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, depth_multiplier, fused_act);
+ std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var,
+ beta, gamma, epsilon, conv_info, depth_multiplier, fused_act);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
return func;
}
@@ -343,15 +324,11 @@ std::unique_ptr<IFunction> create_bounding_box_transform_layer(BoundingBoxTransf
func->configure(input, output, deltas, bbox_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " BoundingBox Info img W: " << bbox_info.img_width() << " "
- << " BoundingBox Info img H: " << bbox_info.img_height() << " "
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO(
+ "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Shape: " << input->info()->tensor_shape()
+ << " BoundingBox Info img W: " << bbox_info.img_width() << " "
+ << " BoundingBox Info img H: " << bbox_info.img_height() << " " << std::endl);
return std::move(func);
}
@@ -379,14 +356,10 @@ std::unique_ptr<IFunction> create_channel_shuffle_layer(ChannelShuffleLayerNode
auto func = std::make_unique<ChannelShuffleLayerFunction>();
func->configure(input, output, num_groups);
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Num groups: " << num_groups
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Num groups: " << num_groups << std::endl);
return func;
}
@@ -403,24 +376,25 @@ std::unique_ptr<IFunction> create_channel_shuffle_layer(ChannelShuffleLayerNode
template <typename ConcatenateLayerFunction, typename TargetInfo>
std::unique_ptr<arm_compute::IFunction> create_concatenate_layer(ConcatenateLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
// Return nullptr if depth concatenate is switched off
- if(!node.is_enabled())
+ if (!node.is_enabled())
{
return nullptr;
}
// Extract IO and info
std::vector<typename TargetInfo::SrcTensorType *> inputs;
- for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ for (unsigned int i = 0; i < node.num_inputs(); ++i)
{
inputs.push_back(get_backing_tensor<TargetInfo>(node.input(i)));
}
- typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
- const DataLayout data_layout = node.output(0) != nullptr ? node.output(0)->desc().layout : DataLayout::UNKNOWN;
- const size_t concat_axis = get_dimension_idx(data_layout, node.concatenation_axis());
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ const DataLayout data_layout = node.output(0) != nullptr ? node.output(0)->desc().layout : DataLayout::UNKNOWN;
+ const size_t concat_axis = get_dimension_idx(data_layout, node.concatenation_axis());
// Create and configure function
auto func = std::make_unique<ConcatenateLayerFunction>();
@@ -429,20 +403,14 @@ std::unique_ptr<arm_compute::IFunction> create_concatenate_layer(ConcatenateLaye
// Log info
const bool is_quantized = is_data_type_quantized_asymmetric(output->info()->data_type());
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Output QuantInfo: " << output->info()->quantization_info();
}
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << output->info()->data_type()
- << " Shape: " << output->info()->tensor_shape()
- << " Num Inputs: " << inputs.size()
- << " Axis: " << concat_axis
- << qss.str()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO(
+ "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << output->info()->data_type() << " Shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size() << " Axis: " << concat_axis << qss.str() << std::endl);
return func;
}
@@ -470,7 +438,7 @@ std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node,
const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- if(is_quantized)
+ if (is_quantized)
{
biases->info()->set_data_type(DataType::S32);
}
@@ -486,55 +454,50 @@ std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node,
std::unique_ptr<IFunction> func;
std::string func_name;
- if(conv_algorithm == ConvolutionMethod::Winograd)
+ if (conv_algorithm == ConvolutionMethod::Winograd)
{
ARM_COMPUTE_ERROR_ON_MSG(num_groups != 1, "WinogradConvolutionLayer does not support grouping!");
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::WinogradConvolutionLayer>(
- std::string("WinogradConvolutionLayer"), mm,
- input, weights, biases, output, conv_info, fused_act, fast_math);
+ std::tie(func, func_name) =
+ create_named_memory_managed_function<typename ConvolutionLayerFunctions::WinogradConvolutionLayer>(
+ std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act,
+ fast_math);
}
- else if(conv_algorithm == ConvolutionMethod::Direct)
+ else if (conv_algorithm == ConvolutionMethod::Direct)
{
ARM_COMPUTE_ERROR_ON_MSG(num_groups != 1, "DirectConvolutionLayer does not support grouping!");
std::tie(func, func_name) = create_named_function<typename ConvolutionLayerFunctions::DirectConvolutionLayer>(
- std::string("DirectConvolutionLayer"),
- input, weights, biases, output, conv_info, fused_act);
+ std::string("DirectConvolutionLayer"), input, weights, biases, output, conv_info, fused_act);
}
- else if(conv_algorithm == ConvolutionMethod::GEMM)
+ else if (conv_algorithm == ConvolutionMethod::GEMM)
{
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::GEMMConvolutionLayer>(
- std::string("GEMMConvolutionLayer"), mm,
- input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1U, 1U), fused_act, num_groups);
+ std::tie(func, func_name) =
+ create_named_memory_managed_function<typename ConvolutionLayerFunctions::GEMMConvolutionLayer>(
+ std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(),
+ Size2D(1U, 1U), fused_act, num_groups);
}
else
{
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::GenericConvolutionLayer>(
- std::string("GenericConvolutionLayer"), mm,
- input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups);
+ std::tie(func, func_name) =
+ create_named_memory_managed_function<typename ConvolutionLayerFunctions::GenericConvolutionLayer>(
+ std::string("GenericConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(),
+ Size2D(1U, 1U), fused_act, fast_math, num_groups);
}
// Log info
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Groups: " << num_groups
+ << node.name() << " Type: " << func_name << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Groups: " << num_groups
<< " Input shape: " << input->info()->tensor_shape()
<< " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << qss.str()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
+ << " Output shape: " << output->info()->tensor_shape() << qss.str()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
return func;
}
@@ -566,19 +529,14 @@ std::unique_ptr<IFunction> create_deconvolution_layer(DeconvolutionLayerNode &no
std::unique_ptr<IFunction> func;
std::tie(func, std::ignore) = create_named_memory_managed_function<DeconvolutionLayerFunction>(
- std::string(), mm,
- input, weights, biases, output, deconv_info);
+ std::string(), mm, input, weights, biases, output, deconv_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -604,7 +562,7 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvoluti
const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- if(is_quantized)
+ if (is_quantized)
{
biases->info()->set_data_type(DataType::S32);
}
@@ -617,30 +575,25 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvoluti
std::unique_ptr<IFunction> func;
std::string func_name;
- std::tie(func, func_name) = create_named_function<DepthwiseConvolutionLayer>(
- std::string("DepthwiseConvolutionLayer"),
- input, weights, biases, output, conv_info, depth_multiplier, fused_act);
+ std::tie(func, func_name) =
+ create_named_function<DepthwiseConvolutionLayer>(std::string("DepthwiseConvolutionLayer"), input, weights,
+ biases, output, conv_info, depth_multiplier, fused_act);
// Log info
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
+ << node.name() << " Type: " << func_name << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " Depth multiplier: " << depth_multiplier
- << qss.str()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
+ << " Depth multiplier: " << depth_multiplier << qss.str()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
return func;
}
@@ -670,15 +623,11 @@ std::unique_ptr<IFunction> create_depth_to_space_layer(DepthToSpaceLayerNode &no
func->configure(input, output, node.block_shape());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Block Size: " << node.block_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Block Size: " << node.block_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -709,15 +658,11 @@ std::unique_ptr<IFunction> create_dequantization_layer(DequantizationLayerNode &
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Input quantization info: " << output->info()->quantization_info()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Input quantization info: " << output->info()->quantization_info()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -753,16 +698,12 @@ std::unique_ptr<IFunction> create_detection_output_layer(DetectionOutputLayerNod
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Input2 shape: " << input2->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " DetectionOutputLayer info: " << detect_info
- << std::endl);
+ << " DetectionOutputLayer info: " << detect_info << std::endl);
return func;
}
@@ -805,19 +746,15 @@ std::unique_ptr<IFunction> create_detection_post_process_layer(DetectionPostProc
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Input2 shape: " << input2->info()->tensor_shape()
<< " Output0 shape: " << output0->info()->tensor_shape()
<< " Output1 shape: " << output1->info()->tensor_shape()
<< " Output2 shape: " << output2->info()->tensor_shape()
<< " Output3 shape: " << output3->info()->tensor_shape()
- << " DetectionPostProcessLayer info: " << detect_info
- << std::endl);
+ << " DetectionPostProcessLayer info: " << detect_info << std::endl);
return func;
}
@@ -849,35 +786,31 @@ std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
std::unique_ptr<IFunction> func = nullptr;
std::string func_name;
- if(eltwise_op == EltwiseOperation::Add)
+ if (eltwise_op == EltwiseOperation::Add)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Addition>(
- std::string("ArithmeticAddition"),
- input1, input2, output, convert_policy, act_info);
+ std::string("ArithmeticAddition"), input1, input2, output, convert_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Sub)
+ else if (eltwise_op == EltwiseOperation::Sub)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Subtraction>(
- std::string("ArithmeticSubtraction"),
- input1, input2, output, convert_policy, act_info);
+ std::string("ArithmeticSubtraction"), input1, input2, output, convert_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Mul)
+ else if (eltwise_op == EltwiseOperation::Mul)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Multiplication>(
- std::string("PixelWiseMultiplication"),
- input1, input2, output, 1.f, convert_policy, node.rounding_policy(), act_info);
+ std::string("PixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy, node.rounding_policy(),
+ act_info);
}
- else if(eltwise_op == EltwiseOperation::Max)
+ else if (eltwise_op == EltwiseOperation::Max)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Maximum>(
- std::string("ElementwiseMaximum"),
- input1, input2, output, act_info);
+ std::string("ElementwiseMaximum"), input1, input2, output, act_info);
}
- else if(eltwise_op == EltwiseOperation::Div)
+ else if (eltwise_op == EltwiseOperation::Div)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Division>(
- std::string("ArithmeticDivision"),
- input1, input2, output, act_info);
+ std::string("ArithmeticDivision"), input1, input2, output, act_info);
}
else
{
@@ -885,14 +818,10 @@ std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
}
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Operation: " << func_name
- << " Data Type: " << input1->info()->data_type()
- << " Shape: " << input1->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType << " Operation: " << func_name
+ << " Data Type: " << input1->info()->data_type()
+ << " Shape: " << input1->info()->tensor_shape() << std::endl);
return func;
}
@@ -921,11 +850,10 @@ std::unique_ptr<IFunction> create_unary_eltwise_layer(UnaryEltwiseLayerNode &nod
std::unique_ptr<IFunction> func = nullptr;
std::string func_name;
- if(eltwise_op == UnaryEltwiseOperation::Exp)
+ if (eltwise_op == UnaryEltwiseOperation::Exp)
{
- std::tie(func, func_name) = create_named_function<typename UnaryEltwiseFunctions::Exp>(
- std::string("Exp"),
- input, output);
+ std::tie(func, func_name) =
+ create_named_function<typename UnaryEltwiseFunctions::Exp>(std::string("Exp"), input, output);
}
else
{
@@ -933,14 +861,10 @@ std::unique_ptr<IFunction> create_unary_eltwise_layer(UnaryEltwiseLayerNode &nod
}
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Operation: " << func_name
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType << " Operation: " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape() << std::endl);
return func;
}
@@ -971,14 +895,10 @@ std::unique_ptr<IFunction> create_flatten_layer(FlattenLayerNode &node)
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1020,22 +940,17 @@ std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode
// Log info
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << qss.str()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << qss.str() << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1075,16 +990,14 @@ std::unique_ptr<IFunction> create_generate_proposals_layer(GenerateProposalsLaye
func->configure(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
- << " Target " << TargetInfo::TargetType
- << " Data Type: " << scores->info()->data_type()
- << " Scores shape: " << scores->info()->tensor_shape()
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.type() << " Target " << TargetInfo::TargetType << " Data Type: "
+ << scores->info()->data_type() << " Scores shape: " << scores->info()->tensor_shape()
<< " Deltas shape: " << deltas->info()->tensor_shape()
<< " Anchors shape: " << anchors->info()->tensor_shape()
<< " Proposals shape: " << proposals->info()->tensor_shape()
<< " Num valid proposals shape: " << num_valid_proposals->info()->tensor_shape()
- << " Scores Out shape: " << scores_out->info()->tensor_shape()
- << std::endl);
+ << " Scores Out shape: " << scores_out->info()->tensor_shape() << std::endl);
return std::move(func);
}
@@ -1119,16 +1032,11 @@ std::unique_ptr<IFunction> create_l2_normalize_layer(L2NormalizeLayerNode &node,
func->configure(input, output, axis, epsilon);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Axis: " << axis
- << " Epsilon: " << epsilon
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Axis: " << axis << " Epsilon: " << epsilon << std::endl);
return func;
}
@@ -1162,15 +1070,11 @@ std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &no
func->configure(input, output, norm_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Normalization info: " << norm_info.type()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << norm_info.type() << std::endl);
return std::move(func);
}
@@ -1204,13 +1108,9 @@ std::unique_ptr<IFunction> create_normalize_planar_yuv_layer(NormalizePlanarYUVL
func->configure(input, output, mean, std);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape() << std::endl);
return std::move(func);
}
@@ -1242,14 +1142,10 @@ std::unique_ptr<IFunction> create_pad_layer(PadLayerNode &node)
func->configure(input, output, padding, pad_value);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1280,15 +1176,11 @@ std::unique_ptr<IFunction> create_permute_layer(PermuteLayerNode &node)
func->configure(input, output, perm);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Permutation vector: " << perm
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Permutation vector: " << perm << std::endl);
return func;
}
@@ -1319,15 +1211,11 @@ std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
func->configure(input, output, pool_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Pooling info: " << pool_info.pool_type
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Pooling info: " << pool_info.pool_type << std::endl);
return func;
}
@@ -1358,14 +1246,10 @@ std::unique_ptr<IFunction> create_prelu_layer(PReluLayerNode &node)
func->configure(input, alpha, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1388,13 +1272,9 @@ std::unique_ptr<IFunction> create_print_layer(PrintLayerNode &node)
ARM_COMPUTE_UNUSED(input);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape() << std::endl);
return nullptr;
}
@@ -1428,15 +1308,11 @@ std::unique_ptr<IFunction> create_priorbox_layer(PriorBoxLayerNode &node)
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " PriorBoxLayer info: " << prior_info
- << std::endl);
+ << " PriorBoxLayer info: " << prior_info << std::endl);
return func;
}
@@ -1466,14 +1342,10 @@ std::unique_ptr<IFunction> create_quantization_layer(QuantizationLayerNode &node
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1508,16 +1380,11 @@ std::unique_ptr<IFunction> create_reduction_operation_layer(ReductionLayerNode &
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
<< " Data Type: " << input->info()->data_type()
<< " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Operation: " << op
- << " Axis: " << axis
- << " Keep dimensions:" << keep_dims
- << std::endl);
+ << " Output shape: " << output->info()->tensor_shape() << " Operation: " << op
+ << " Axis: " << axis << " Keep dimensions:" << keep_dims << std::endl);
return func;
}
@@ -1547,14 +1414,10 @@ std::unique_ptr<IFunction> create_reorg_layer(ReorgLayerNode &node)
func->configure(input, output, node.stride());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1584,14 +1447,10 @@ std::unique_ptr<IFunction> create_reshape_layer(ReshapeLayerNode &node)
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1619,18 +1478,15 @@ std::unique_ptr<IFunction> create_resize_layer(ResizeLayerNode &node)
// Create and configure function
auto func = std::make_unique<ResizeLayerFunction>();
- func->configure(input, output, ScaleKernelInfo{ policy, BorderMode::CONSTANT, PixelValue(), SamplingPolicy::CENTER, false, false });
+ func->configure(input, output,
+ ScaleKernelInfo{policy, BorderMode::CONSTANT, PixelValue(), SamplingPolicy::CENTER, false, false});
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Interpolation: " << policy
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Interpolation: " << policy << std::endl);
return func;
}
@@ -1665,17 +1521,13 @@ std::unique_ptr<IFunction> create_roi_align_layer(ROIAlignLayerNode &node)
func->configure(input, rois, output, pool_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " ROIs shape: " << rois->info()->tensor_shape()
- << " ROIPooling width: " << pool_info.pooled_width()
- << " ROIPooling height: " << pool_info.pooled_height()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " ROIs shape: " << rois->info()->tensor_shape()
+ << " ROIPooling width: " << pool_info.pooled_width()
+ << " ROIPooling height: " << pool_info.pooled_height() << std::endl);
return std::move(func);
}
@@ -1705,14 +1557,10 @@ std::unique_ptr<IFunction> create_slice_layer(SliceLayerNode &node)
func->configure(input, output, node.starts(), node.ends());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1744,14 +1592,10 @@ std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphCon
func->configure(input, output, beta);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -1768,12 +1612,13 @@ std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphCon
template <typename StackLayerFunction, typename TargetInfo>
std::unique_ptr<arm_compute::IFunction> create_stack_layer(StackLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Stack node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Stack node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
// Extract IO and info
std::vector<typename TargetInfo::TensorType *> inputs;
- for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ for (unsigned int i = 0; i < node.num_inputs(); ++i)
{
inputs.push_back(get_backing_tensor<TargetInfo>(node.input(i)));
}
@@ -1785,16 +1630,12 @@ std::unique_ptr<arm_compute::IFunction> create_stack_layer(StackLayerNode &node)
func->configure(inputs, axis, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << output->info()->data_type()
- << " Inputs shape: " << inputs[0]->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Num Inputs: " << inputs.size()
- << " Axis: " << axis
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << output->info()->data_type()
+ << " Inputs shape: " << inputs[0]->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size() << " Axis: " << axis << std::endl);
return func;
}
@@ -1829,14 +1670,10 @@ std::unique_ptr<IFunction> create_strided_slice_layer(StridedSliceLayerNode &nod
func->configure(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
index 19c627d479..27e21cbc7e 100644
--- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
@@ -70,15 +70,19 @@ public:
* @param[in] fused_act Activation layer information in case of a fused activation.
*
*/
- void configure(TensorType *input,
- TensorType *weights,
- TensorType *bias,
- TensorType *output,
- const TensorType *mean,
- const TensorType *var,
- const TensorType *beta,
- const TensorType *gamma,
- float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math, ActivationLayerInfo const &fused_act)
+ void configure(TensorType *input,
+ TensorType *weights,
+ TensorType *bias,
+ TensorType *output,
+ const TensorType *mean,
+ const TensorType *var,
+ const TensorType *beta,
+ const TensorType *gamma,
+ float epsilon,
+ const PadStrideInfo &conv_info,
+ unsigned int num_groups,
+ bool fast_math,
+ ActivationLayerInfo const &fused_act)
{
// We don't run any validate, as we assume that the layers have been already validated
const bool has_bias = (bias != nullptr);
@@ -86,7 +90,7 @@ public:
// We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
// as batch normalization might end up with a bias != 0
- if(has_bias)
+ if (has_bias)
{
_fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon);
bias_to_use = bias;
@@ -97,9 +101,10 @@ public:
bias_to_use = &_fused_bias;
}
- _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups);
+ _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act,
+ fast_math, num_groups);
- if(!has_bias)
+ if (!has_bias)
{
_fused_bias.allocator()->allocate();
}
@@ -114,7 +119,7 @@ public:
void prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fused_batch_norm_layer.run();
_is_prepared = true;
diff --git a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
index 4f8a8da1fb..07a2cdd8b8 100644
--- a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
@@ -67,15 +67,18 @@ public:
* @param[in] fused_act Activation layer information in case of a fused activation.
*
*/
- void configure(TensorType *input,
- TensorType *weights,
- TensorType *bias,
- TensorType *output,
- const TensorType *mean,
- const TensorType *var,
- const TensorType *beta,
- const TensorType *gamma,
- float epsilon, const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo const &fused_act)
+ void configure(TensorType *input,
+ TensorType *weights,
+ TensorType *bias,
+ TensorType *output,
+ const TensorType *mean,
+ const TensorType *var,
+ const TensorType *beta,
+ const TensorType *gamma,
+ float epsilon,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo const &fused_act)
{
// We don't run any validate, as we assume that the layers have been already validated
const bool has_bias = (bias != nullptr);
@@ -83,20 +86,23 @@ public:
// We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
// as batch normalization might end up with a bias != 0
- if(has_bias)
+ if (has_bias)
{
- _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon, FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
+ _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon,
+ FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
bias_to_use = bias;
}
else
{
- _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon, FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
+ _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon,
+ FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
bias_to_use = &_fused_bias;
}
- _depth_conv_layer.configure(input, weights, bias_to_use, output, conv_info, depth_multiplier, fused_act.enabled() ? fused_act : ActivationLayerInfo());
+ _depth_conv_layer.configure(input, weights, bias_to_use, output, conv_info, depth_multiplier,
+ fused_act.enabled() ? fused_act : ActivationLayerInfo());
- if(!has_bias)
+ if (!has_bias)
{
_fused_bias.allocator()->allocate();
}
@@ -111,7 +117,7 @@ public:
void prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fused_batch_norm_layer.run();
_is_prepared = true;
diff --git a/arm_compute/graph/backends/NEON/NEDeviceBackend.h b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
index 9cb37d4553..cd817a20d8 100644
--- a/arm_compute/graph/backends/NEON/NEDeviceBackend.h
+++ b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_NEDEVICEBACKEND_H
#include "arm_compute/graph/IDeviceBackend.h"
-
#include "arm_compute/runtime/Allocator.h"
namespace arm_compute
@@ -41,16 +40,17 @@ public:
NEDeviceBackend();
// Inherited overridden methods
- void initialize_backend() override;
- void setup_backend_context(GraphContext &ctx) override;
- void release_backend_context(GraphContext &ctx) override;
+ void initialize_backend() override;
+ void setup_backend_context(GraphContext &ctx) override;
+ void release_backend_context(GraphContext &ctx) override;
bool is_backend_supported() override;
IAllocator *backend_allocator() override;
std::unique_ptr<ITensorHandle> create_tensor(const Tensor &tensor) override;
- std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
- std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
- Status validate_node(INode &node) override;
- std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
+ std::unique_ptr<ITensorHandle>
+ create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
+ std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
+ Status validate_node(INode &node) override;
+ std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() override;
void sync() override;
diff --git a/arm_compute/graph/backends/NEON/NESubTensorHandle.h b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
index a438b65735..3619f4ed1b 100644
--- a/arm_compute/graph/backends/NEON/NESubTensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_NESUBTENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/SubTensor.h"
namespace arm_compute
@@ -45,7 +44,10 @@ public:
* @param[in] coords Starting coordinates
* @param[in] extend_parent Extends parent shape if true
*/
- NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent = false);
+ NESubTensorHandle(ITensorHandle *parent_handle,
+ const TensorShape &shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~NESubTensorHandle() = default;
/** Allow instances of this class to be move constructed */
@@ -58,10 +60,10 @@ public:
NESubTensorHandle &operator=(const NESubTensorHandle &) = delete;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/NEON/NETensorHandle.h b/arm_compute/graph/backends/NEON/NETensorHandle.h
index 99101a8fe9..1df90822ba 100644
--- a/arm_compute/graph/backends/NEON/NETensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NETensorHandle.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_NETENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
@@ -51,10 +50,10 @@ public:
NETensorHandle &operator=(NETensorHandle &&) = default;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/Utils.h b/arm_compute/graph/backends/Utils.h
index 774ce515b5..5f4e66c207 100644
--- a/arm_compute/graph/backends/Utils.h
+++ b/arm_compute/graph/backends/Utils.h
@@ -42,7 +42,8 @@ namespace backends
* @return A configured backend function
*/
template <typename FunctionType, typename FunctionNameType, typename... ParameterType>
-std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_named_function(FunctionNameType name, ParameterType... args)
+std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_named_function(FunctionNameType name,
+ ParameterType... args)
{
auto f = std::make_unique<FunctionType>();
f->configure(std::forward<ParameterType>(args)...);
@@ -58,9 +59,8 @@ std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_nam
* @return A configured backend function
*/
template <typename FunctionType, typename FunctionNameType, typename MemoryManagerType, typename... ParameterType>
-std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_named_memory_managed_function(FunctionNameType name,
- MemoryManagerType mm,
- ParameterType... args)
+std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType>
+create_named_memory_managed_function(FunctionNameType name, MemoryManagerType mm, ParameterType... args)
{
auto f = std::make_unique<FunctionType>(mm);
f->configure(std::forward<ParameterType>(args)...);
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index 71a6201554..0e102942a7 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -24,14 +24,13 @@
#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/graph/Types.h"
-#include "arm_compute/graph/nodes/Nodes.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
namespace arm_compute
{
@@ -63,7 +62,8 @@ inline arm_compute::ITensorInfo *get_backing_tensor_info(arm_compute::graph::Ten
template <typename ArgMinMaxLayer>
Status validate_arg_min_max_layer(ArgMinMaxLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ArgMinMaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ArgMinMaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -86,7 +86,8 @@ Status validate_arg_min_max_layer(ArgMinMaxLayerNode &node)
template <typename BoundingBoxTransformLayer>
Status validate_bounding_box_transform_layer(BoundingBoxTransformLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating BoundingBoxTransformLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating BoundingBoxTransformLayer node with ID : " << node.id() << " and Name: "
+ << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -110,7 +111,8 @@ Status validate_bounding_box_transform_layer(BoundingBoxTransformLayerNode &node
template <typename ChannelShuffleLayer>
Status validate_channel_shuffle_layer(ChannelShuffleLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ChannelShuffle node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ChannelShuffle node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -133,10 +135,14 @@ Status validate_channel_shuffle_layer(ChannelShuffleLayerNode &node)
*
* @return Status
*/
-template <typename ConvolutionLayer, typename DirectConvolutionLayer, typename GEMMConvolutionLayer, typename WinogradConvolutionLayer>
+template <typename ConvolutionLayer,
+ typename DirectConvolutionLayer,
+ typename GEMMConvolutionLayer,
+ typename WinogradConvolutionLayer>
Status validate_convolution_layer(ConvolutionLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -146,7 +152,7 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
arm_compute::ITensorInfo *biases = get_backing_tensor_info(node.input(2));
arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
biases->set_data_type(DataType::S32);
}
@@ -158,23 +164,24 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
// Validate function
Status status{};
- switch(conv_algorithm)
+ switch (conv_algorithm)
{
case ConvolutionMethod::Direct:
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "DirectConvolutionLayer does not support grouping!");
status = DirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
break;
case ConvolutionMethod::GEMM:
- status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), num_groups);
+ status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, WeightsInfo(),
+ Size2D(1, 1), ActivationLayerInfo(), num_groups);
break;
case ConvolutionMethod::Winograd:
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "WinogradConvolutionLayer does not support grouping!");
- status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math);
+ status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info,
+ ActivationLayerInfo(), fast_math);
break;
case ConvolutionMethod::Default:
- status = ConvolutionLayer::validate(input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), fast_math, num_groups);
+ status = ConvolutionLayer::validate(input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1),
+ ActivationLayerInfo(), fast_math, num_groups);
break;
default:
ARM_COMPUTE_RETURN_ERROR_MSG("Unsupported convolution method");
@@ -194,7 +201,8 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
template <typename DepthwiseConvolutionLayer>
Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: "
+ << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -210,7 +218,7 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
// Validate function
Status status{};
- switch(dwc_algorithm)
+ switch (dwc_algorithm)
{
case DepthwiseConvolutionMethod::Default:
case DepthwiseConvolutionMethod::Optimized3x3:
@@ -233,7 +241,8 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
template <typename DepthToSpaceLayer>
Status validate_depth_to_space_layer(DepthToSpaceLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -254,7 +263,8 @@ Status validate_depth_to_space_layer(DepthToSpaceLayerNode &node)
template <typename DequantizationLayer>
Status validate_dequantization_layer(DequantizationLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -275,7 +285,8 @@ Status validate_dequantization_layer(DequantizationLayerNode &node)
template <typename DetectionOutputLayer>
Status validate_detection_output_layer(DetectionOutputLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -299,7 +310,8 @@ Status validate_detection_output_layer(DetectionOutputLayerNode &node)
template <typename DetectionPostProcessLayer>
Status validate_detection_post_process_layer(DetectionPostProcessLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionPostProcessLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionPostProcessLayer node with ID : " << node.id() << " and Name: "
+ << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 4);
@@ -327,7 +339,8 @@ Status validate_detection_post_process_layer(DetectionPostProcessLayerNode &node
template <typename GenerateProposalsLayer>
Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GenerateProposalsLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating GenerateProposalsLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 3);
@@ -354,7 +367,8 @@ Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node)
template <typename L2NormalizeLayer>
Status validate_l2_normalize_layer(L2NormalizeLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating L2NormalizeLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating L2NormalizeLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -379,7 +393,8 @@ Status validate_l2_normalize_layer(L2NormalizeLayerNode &node)
template <typename NormalizePlanarYUVLayer>
Status validate_normalize_planar_yuv_layer(NormalizePlanarYUVLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating NormalizePlanarYUVLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating NormalizePlanarYUVLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -404,7 +419,8 @@ Status validate_normalize_planar_yuv_layer(NormalizePlanarYUVLayerNode &node)
template <typename PadLayer>
Status validate_pad_layer(PadLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PadLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PadLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -427,14 +443,15 @@ Status validate_pad_layer(PadLayerNode &node)
template <typename PermuteLayer>
Status validate_permute_layer(PermuteLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PermuteLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PermuteLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
// Extract IO and info
arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0));
arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
- const PermutationVector &perm = node.permutation_vector();
+ const PermutationVector &perm = node.permutation_vector();
return PermuteLayer::validate(input, output, perm);
}
@@ -450,7 +467,8 @@ Status validate_permute_layer(PermuteLayerNode &node)
template <typename PReluLayer>
Status validate_prelu_layer(PReluLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PRelu node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PRelu node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -473,7 +491,8 @@ Status validate_prelu_layer(PReluLayerNode &node)
template <typename PriorBoxLayer>
Status validate_priorbox_layer(PriorBoxLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PriorBoxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating PriorBoxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -497,7 +516,8 @@ Status validate_priorbox_layer(PriorBoxLayerNode &node)
template <typename QuantizationLayer>
Status validate_quantization_layer(QuantizationLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating QuantizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating QuantizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -520,7 +540,8 @@ Status validate_quantization_layer(QuantizationLayerNode &node)
template <typename ReductionLayer>
Status validate_reduction_operation_layer(ReductionLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReductionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ReductionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -544,7 +565,8 @@ Status validate_reduction_operation_layer(ReductionLayerNode &node)
template <typename ReorgLayer>
Status validate_reorg_layer(ReorgLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReorgLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReorgLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -567,7 +589,8 @@ Status validate_reorg_layer(ReorgLayerNode &node)
template <typename ReshapeLayer>
Status validate_reshape_layer(ReshapeLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -590,14 +613,15 @@ Status validate_reshape_layer(ReshapeLayerNode &node)
template <typename ROIAlignLayer>
Status validate_roi_align_layer(ROIAlignLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ROIAlignLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ROIAlignLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
// Extract input and output
- arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
- arm_compute::ITensorInfo *rois = detail::get_backing_tensor_info(node.input(1));
- arm_compute::ITensorInfo *output = detail::get_backing_tensor_info(node.output(0));
+ arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *rois = detail::get_backing_tensor_info(node.input(1));
+ arm_compute::ITensorInfo *output = detail::get_backing_tensor_info(node.output(0));
const ROIPoolingLayerInfo &pool_info = node.pooling_info();
// Validate function
@@ -615,7 +639,8 @@ Status validate_roi_align_layer(ROIAlignLayerNode &node)
template <typename SliceLayer>
Status validate_slice_layer(SliceLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Slice node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Slice node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -639,7 +664,8 @@ Status validate_slice_layer(SliceLayerNode &node)
template <typename StridedSliceLayer>
Status validate_strided_slice_layer(StridedSliceLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating StridedSlice node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating StridedSlice node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -651,7 +677,8 @@ Status validate_strided_slice_layer(StridedSliceLayerNode &node)
const BiStrides strides = node.strides();
const StridedSliceLayerInfo info = node.strided_slice_info();
- return StridedSliceLayer::validate(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask());
+ return StridedSliceLayer::validate(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(),
+ info.shrink_axis_mask());
}
/** Validates an element-wise layer node
@@ -663,7 +690,8 @@ Status validate_strided_slice_layer(StridedSliceLayerNode &node)
template <typename EltwiseLayerFunctions>
Status validate_eltwise_Layer(EltwiseLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -678,23 +706,24 @@ Status validate_eltwise_Layer(EltwiseLayerNode &node)
const QuantizationInfo quant_info = node.output_quant_info();
// Validate function
- if(eltwise_op == EltwiseOperation::Add)
+ if (eltwise_op == EltwiseOperation::Add)
{
return EltwiseLayerFunctions::ArithmeticAddition::validate(input1, input2, output, convert_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Sub)
+ else if (eltwise_op == EltwiseOperation::Sub)
{
return EltwiseLayerFunctions::ArithmeticSubtraction::validate(input1, input2, output, convert_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Mul)
+ else if (eltwise_op == EltwiseOperation::Mul)
{
- return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, 1.0f, convert_policy, round_policy, act_info);
+ return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, 1.0f, convert_policy,
+ round_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Max)
+ else if (eltwise_op == EltwiseOperation::Max)
{
return EltwiseLayerFunctions::ElementwiseMax::validate(input1, input2, output, act_info);
}
- else if(eltwise_op == EltwiseOperation::Div)
+ else if (eltwise_op == EltwiseOperation::Div)
{
return EltwiseLayerFunctions::ArithmeticDivision::validate(input1, input2, output, act_info);
}
@@ -713,7 +742,8 @@ Status validate_eltwise_Layer(EltwiseLayerNode &node)
template <typename UnaryEltwiseLayerFunctions>
Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -723,7 +753,7 @@ Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
const UnaryEltwiseOperation eltwise_op = node.eltwise_descriptor().op;
// Validate function
- if(eltwise_op == UnaryEltwiseOperation::Exp)
+ if (eltwise_op == UnaryEltwiseOperation::Exp)
{
return UnaryEltwiseLayerFunctions::ExpLayer::validate(input, output);
}
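
For context, the templated helpers above are instantiated with a backend-specific bundle of function types. The following is a minimal, hypothetical sketch using the library's NEON runtime functions; the struct name and free function are illustrative, and it assumes the helpers above live in arm_compute::graph::backends::detail.

#include "arm_compute/graph/backends/ValidateHelpers.h"
#include "arm_compute/graph/nodes/EltwiseLayerNode.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"

// Hypothetical function bundle satisfying the EltwiseLayerFunctions template
// parameter of validate_eltwise_Layer<> shown above.
struct MyEltwiseFunctions
{
    using ArithmeticAddition      = arm_compute::NEArithmeticAddition;
    using ArithmeticSubtraction   = arm_compute::NEArithmeticSubtraction;
    using PixelWiseMultiplication = arm_compute::NEPixelWiseMultiplication;
    using ElementwiseMax          = arm_compute::NEElementwiseMax;
    using ArithmeticDivision      = arm_compute::NEElementwiseDivision;
};

arm_compute::Status validate_my_eltwise_node(arm_compute::graph::EltwiseLayerNode &node)
{
    // The helper selects the matching <Function>::validate() based on the node's EltwiseOperation.
    return arm_compute::graph::backends::detail::validate_eltwise_Layer<MyEltwiseFunctions>(node);
}
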
diff --git a/arm_compute/graph/frontend/IStream.h b/arm_compute/graph/frontend/IStream.h
index f69d5437c1..1831ac0be3 100644
--- a/arm_compute/graph/frontend/IStream.h
+++ b/arm_compute/graph/frontend/IStream.h
@@ -84,8 +84,8 @@ public:
}
protected:
- StreamHints _hints = {}; /**< Execution and algorithmic hints */
- NodeID _tail_node = { EmptyNodeID }; /**< NodeID pointing to the last(tail) node of the graph */
+ StreamHints _hints = {}; /**< Execution and algorithmic hints */
+ NodeID _tail_node = {EmptyNodeID}; /**< NodeID pointing to the last(tail) node of the graph */
};
} // namespace frontend
} // namespace graph
diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h
index fe0539bac5..bd321e6f1a 100644
--- a/arm_compute/graph/frontend/Layers.h
+++ b/arm_compute/graph/frontend/Layers.h
@@ -24,13 +24,12 @@
#ifndef ARM_COMPUTE_GRAPH_LAYERS_H
#define ARM_COMPUTE_GRAPH_LAYERS_H
-#include "arm_compute/graph/GraphBuilder.h"
-#include "arm_compute/graph/Types.h"
+#include "arm_compute/core/utils/misc/Utility.h"
#include "arm_compute/graph/frontend/ILayer.h"
#include "arm_compute/graph/frontend/IStream.h"
#include "arm_compute/graph/frontend/SubStream.h"
-
-#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/graph/GraphBuilder.h"
+#include "arm_compute/graph/Types.h"
#include <memory>
#include <string>
@@ -50,14 +49,13 @@ public:
* @param[in] desc Description of input tensor.
* @param[in] accessor Accessor to get input tensor data from.
*/
- InputLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor)
- : _desc(desc), _accessor(std::move(accessor))
+ InputLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) : _desc(desc), _accessor(std::move(accessor))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
+ NodeParams common_params = {name(), s.hints().target_hint};
return GraphBuilder::add_input_node(s.graph(), common_params, _desc, std::move(_accessor));
}
@@ -75,14 +73,13 @@ public:
* @param[in] desc Description of input tensor.
* @param[in] accessor Accessor to get input tensor data from.
*/
- ConstantLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor)
- : _desc(desc), _accessor(std::move(accessor))
+ ConstantLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) : _desc(desc), _accessor(std::move(accessor))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
+ NodeParams common_params = {name(), s.hints().target_hint};
return GraphBuilder::add_const_node(s.graph(), common_params, _desc, std::move(_accessor));
}
@@ -107,8 +104,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), _connection_idx };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), _connection_idx};
return GraphBuilder::add_output_node(s.graph(), common_params, input, std::move(_accessor));
}
@@ -126,18 +123,17 @@ public:
* @param[in] act_info Activation information
* @param[in] out_quant_info (Optional) Output quantization info
*/
- ActivationLayer(ActivationLayerInfo act_info,
- const QuantizationInfo out_quant_info = QuantizationInfo())
- : _act_info(act_info),
- _out_quant_info(std::move(out_quant_info))
+ ActivationLayer(ActivationLayerInfo act_info, const QuantizationInfo out_quant_info = QuantizationInfo())
+ : _act_info(act_info), _out_quant_info(std::move(out_quant_info))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_activation_node(s.graph(), common_params, input, _act_info, std::move(_out_quant_info));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_activation_node(s.graph(), common_params, input, _act_info,
+ std::move(_out_quant_info));
}
private:
@@ -160,10 +156,7 @@ public:
unsigned int axis,
DataType out_data_type = DataType::UNKNOWN,
const QuantizationInfo out_quant_info = QuantizationInfo())
- : _op(op),
- _axis(axis),
- _out_data_type(out_data_type),
- _out_quant_info(std::move(out_quant_info))
+ : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info))
{
}
@@ -175,9 +168,10 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_arg_min_max_node(s.graph(), common_params, input, _op, _axis, _out_data_type, std::move(_out_quant_info));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_arg_min_max_node(s.graph(), common_params, input, _op, _axis, _out_data_type,
+ std::move(_out_quant_info));
}
private:
@@ -204,7 +198,11 @@ public:
ITensorAccessorUPtr gamma = nullptr,
ITensorAccessorUPtr beta = nullptr,
float epsilon = 0.001f)
- : _mean(std::move(mean)), _var(std::move(var)), _gamma(std::move(gamma)), _beta(std::move(beta)), _epsilon(epsilon)
+ : _mean(std::move(mean)),
+ _var(std::move(var)),
+ _gamma(std::move(gamma)),
+ _beta(std::move(beta)),
+ _epsilon(epsilon)
{
}
@@ -213,10 +211,10 @@ public:
ARM_COMPUTE_ERROR_ON(_mean == nullptr);
ARM_COMPUTE_ERROR_ON(_var == nullptr);
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_batch_normalization_node(s.graph(), common_params, input, _epsilon,
- std::move(_mean), std::move(_var), std::move(_beta), std::move(_gamma));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_batch_normalization_node(s.graph(), common_params, input, _epsilon, std::move(_mean),
+ std::move(_var), std::move(_beta), std::move(_gamma));
}
private:
@@ -237,7 +235,9 @@ public:
* @param[in] sub_stream_deltas Graph sub-stream for the deltas
* @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
*/
- BoundingBoxTransformLayer(SubStream &&sub_stream_input, SubStream &&sub_stream_deltas, BoundingBoxTransformInfo info)
+ BoundingBoxTransformLayer(SubStream &&sub_stream_input,
+ SubStream &&sub_stream_deltas,
+ BoundingBoxTransformInfo info)
: _ss_input(sub_stream_input), _ss_deltas(sub_stream_deltas), _bbox_info(info)
{
}
@@ -250,9 +250,9 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { _ss_input.tail_node(), 0 };
- NodeIdxPair deltas = { _ss_deltas.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {_ss_input.tail_node(), 0};
+ NodeIdxPair deltas = {_ss_deltas.tail_node(), 0};
return GraphBuilder::add_bounding_box_transform_node(s.graph(), common_params, input, deltas, _bbox_info);
}
@@ -270,15 +270,14 @@ public:
*
* @param[in] num_groups Number of groups
*/
- ChannelShuffleLayer(unsigned int num_groups)
- : _num_groups(num_groups)
+ ChannelShuffleLayer(unsigned int num_groups) : _num_groups(num_groups)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_channel_shuffle_node(s.graph(), common_params, input, _num_groups);
}
@@ -297,17 +296,15 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- ConcatLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
+ ConcatLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams)
: _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL)
{
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
/** Construct a concatenation layer
*
@@ -317,33 +314,33 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- ConcatLayer(descriptors::ConcatLayerDescriptor concat_descriptor, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
+ ConcatLayer(descriptors::ConcatLayerDescriptor concat_descriptor,
+ SubStream &&sub_stream1,
+ SubStream &&sub_stream2,
+ Ts &&...rest_sub_streams)
: _sub_streams(), _concat_descriptor(concat_descriptor)
{
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
/** Construct a concat layer
*
* @param[in] sub_stream Sub-stream
*/
template <typename... Ts>
- ConcatLayer(SubStream &&sub_stream)
- : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL)
+ ConcatLayer(SubStream &&sub_stream) : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL)
{
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
}
NodeID create_layer(IStream &s) override
{
NodeID nid = EmptyNodeID;
- NodeParams common_params = { name(), s.hints().target_hint };
- if(_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
+ NodeParams common_params = {name(), s.hints().target_hint};
+ if (_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
{
nid = _sub_streams[0]->tail_node();
}
@@ -351,14 +348,14 @@ public:
{
// Collect tail nodes and concatenate
std::vector<NodeIdxPair> nodes;
- for(auto &ss : _sub_streams)
+ for (auto &ss : _sub_streams)
{
- if(ss && (ss->tail_node() != EmptyNodeID))
+ if (ss && (ss->tail_node() != EmptyNodeID))
{
const auto tail_node = s.graph().node(ss->tail_node());
- if(tail_node != nullptr && tail_node->type() != NodeType::Output)
+ if (tail_node != nullptr && tail_node->type() != NodeType::Output)
{
- nodes.push_back({ ss->tail_node(), 0 });
+ nodes.push_back({ss->tail_node(), 0});
}
}
}
@@ -411,12 +408,12 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeIdxPair input = { s.tail_node(), 0 };
- NodeParams common_params = { name(), s.hints().target_hint };
- return GraphBuilder::add_convolution_node(s.graph(), common_params, input,
- Size2D(_conv_width, _conv_height), _ofm, _conv_info, _num_groups,
- s.hints().convolution_method_hint, s.hints().fast_math_hint,
- std::move(_weights), std::move(_bias), std::move(_weights_quant_info), std::move(_out_quant_info));
+ NodeIdxPair input = {s.tail_node(), 0};
+ NodeParams common_params = {name(), s.hints().target_hint};
+ return GraphBuilder::add_convolution_node(s.graph(), common_params, input, Size2D(_conv_width, _conv_height),
+ _ofm, _conv_info, _num_groups, s.hints().convolution_method_hint,
+ s.hints().fast_math_hint, std::move(_weights), std::move(_bias),
+ std::move(_weights_quant_info), std::move(_out_quant_info));
}
private:
@@ -461,11 +458,10 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeIdxPair input = { s.tail_node(), 0 };
- NodeParams common_params = { name(), s.hints().target_hint };
- return GraphBuilder::add_deconvolution_node(s.graph(), common_params, input,
- Size2D(_conv_width, _conv_height), _ofm, _deconv_info,
- std::move(_weights), std::move(_bias));
+ NodeIdxPair input = {s.tail_node(), 0};
+ NodeParams common_params = {name(), s.hints().target_hint};
+ return GraphBuilder::add_deconvolution_node(s.graph(), common_params, input, Size2D(_conv_width, _conv_height),
+ _ofm, _deconv_info, std::move(_weights), std::move(_bias));
}
private:
@@ -513,12 +509,12 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeIdxPair input = { s.tail_node(), 0 };
- NodeParams common_params = { name(), s.hints().target_hint };
- return GraphBuilder::add_depthwise_convolution_node(s.graph(), common_params,
- input, Size2D(_conv_width, _conv_height), _conv_info, _depth_multiplier,
- s.hints().depthwise_convolution_method_hint,
- std::move(_weights), std::move(_bias), std::move(_weights_quant_info), std::move(_out_quant_info));
+ NodeIdxPair input = {s.tail_node(), 0};
+ NodeParams common_params = {name(), s.hints().target_hint};
+ return GraphBuilder::add_depthwise_convolution_node(
+ s.graph(), common_params, input, Size2D(_conv_width, _conv_height), _conv_info, _depth_multiplier,
+ s.hints().depthwise_convolution_method_hint, std::move(_weights), std::move(_bias),
+ std::move(_weights_quant_info), std::move(_out_quant_info));
}
private:
@@ -540,15 +536,14 @@ public:
*
     * @param[in] block_shape Block size to be rearranged
*/
- DepthToSpaceLayer(int32_t block_shape)
- : _block_shape(block_shape)
+ DepthToSpaceLayer(int32_t block_shape) : _block_shape(block_shape)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_depth_to_space_node(s.graph(), common_params, input, _block_shape);
}
@@ -569,8 +564,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_dequantization_node(s.graph(), common_params, input);
}
};
@@ -585,18 +580,21 @@ public:
* @param[in] sub_stream_prior PriorBox graph sub-stream.
* @param[in] detect_info DetectionOutput parameters.
*/
- DetectionOutputLayer(SubStream &&sub_stream_conf, SubStream &&sub_stream_prior, const DetectionOutputLayerInfo &detect_info)
+ DetectionOutputLayer(SubStream &&sub_stream_conf,
+ SubStream &&sub_stream_prior,
+ const DetectionOutputLayerInfo &detect_info)
: _ss_conf(std::move(sub_stream_conf)), _ss_prior(std::move(sub_stream_prior)), _detect_info(detect_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input_loc = { s.tail_node(), 0 };
- NodeIdxPair input_conf = { _ss_conf.tail_node(), 0 };
- NodeIdxPair input_priorbox = { _ss_prior.tail_node(), 0 };
- return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox, _detect_info);
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input_loc = {s.tail_node(), 0};
+ NodeIdxPair input_conf = {_ss_conf.tail_node(), 0};
+ NodeIdxPair input_priorbox = {_ss_prior.tail_node(), 0};
+ return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox,
+ _detect_info);
}
private:
@@ -615,9 +613,14 @@ public:
* @param[in] anchors Accessor to get anchors tensor data from.
* @param[in] out_quant_info (Optional) Output quantization info
*/
- DetectionPostProcessLayer(SubStream &&sub_stream_class_prediction, DetectionPostProcessLayerInfo detect_info, ITensorAccessorUPtr anchors,
- const QuantizationInfo out_quant_info = QuantizationInfo())
- : _sub_stream_class_prediction(std::move(sub_stream_class_prediction)), _detect_info(detect_info), _anchors(std::move(anchors)), _out_quant_info(std::move(out_quant_info))
+ DetectionPostProcessLayer(SubStream &&sub_stream_class_prediction,
+ DetectionPostProcessLayerInfo detect_info,
+ ITensorAccessorUPtr anchors,
+ const QuantizationInfo out_quant_info = QuantizationInfo())
+ : _sub_stream_class_prediction(std::move(sub_stream_class_prediction)),
+ _detect_info(detect_info),
+ _anchors(std::move(anchors)),
+ _out_quant_info(std::move(out_quant_info))
{
}
@@ -625,10 +628,12 @@ public:
{
ARM_COMPUTE_ERROR_ON(_anchors == nullptr);
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input_box_encoding = { s.tail_node(), 0 };
- NodeIdxPair input_class_prediction = { _sub_stream_class_prediction.tail_node(), 0 };
- return GraphBuilder::add_detection_post_process_node(s.graph(), common_params, input_box_encoding, input_class_prediction, _detect_info, std::move(_anchors), std::move(_out_quant_info));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input_box_encoding = {s.tail_node(), 0};
+ NodeIdxPair input_class_prediction = {_sub_stream_class_prediction.tail_node(), 0};
+ return GraphBuilder::add_detection_post_process_node(s.graph(), common_params, input_box_encoding,
+ input_class_prediction, _detect_info, std::move(_anchors),
+ std::move(_out_quant_info));
}
private:
@@ -645,15 +650,14 @@ public:
*
* @param[in] shape Output shape
*/
- DummyLayer(TensorShape shape)
- : _shape(shape)
+ DummyLayer(TensorShape shape) : _shape(shape)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_dummy_node(s.graph(), common_params, input, _shape);
}
@@ -677,9 +681,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input0 = { _ss0.tail_node(), 0 };
- NodeIdxPair input1 = { _ss1.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input0 = {_ss0.tail_node(), 0};
+ NodeIdxPair input1 = {_ss1.tail_node(), 0};
return GraphBuilder::add_elementwise_node(s.graph(), common_params, input0, input1, _op);
}
@@ -700,8 +704,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_flatten_node(s.graph(), common_params, input);
}
};
@@ -770,13 +774,13 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- if(_weights != nullptr)
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ if (_weights != nullptr)
{
- return GraphBuilder::add_fully_connected_layer(s.graph(), common_params, input, _num_outputs,
- std::move(_weights), std::move(_bias), _fc_info,
- std::move(_weights_quant_info), std::move(_out_quant_info), s.hints().fast_math_hint);
+ return GraphBuilder::add_fully_connected_layer(
+ s.graph(), common_params, input, _num_outputs, std::move(_weights), std::move(_bias), _fc_info,
+ std::move(_weights_quant_info), std::move(_out_quant_info), s.hints().fast_math_hint);
}
else
{
@@ -811,8 +815,14 @@ public:
* @param[in] ss_anchors Graph sub-stream for the anchors.
* @param[in] info Generate Proposals operation information.
*/
- GenerateProposalsLayer(SubStream &&ss_scores, SubStream &&ss_deltas, SubStream &&ss_anchors, GenerateProposalsInfo info)
- : _ss_scores(std::move(ss_scores)), _ss_deltas(std::move(ss_deltas)), _ss_anchors(std::move(ss_anchors)), _info(info)
+ GenerateProposalsLayer(SubStream &&ss_scores,
+ SubStream &&ss_deltas,
+ SubStream &&ss_anchors,
+ GenerateProposalsInfo info)
+ : _ss_scores(std::move(ss_scores)),
+ _ss_deltas(std::move(ss_deltas)),
+ _ss_anchors(std::move(ss_anchors)),
+ _info(info)
{
}
@@ -824,10 +834,10 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair scores = { _ss_scores.tail_node(), 0 };
- NodeIdxPair deltas = { _ss_deltas.tail_node(), 0 };
- NodeIdxPair anchors = { _ss_anchors.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair scores = {_ss_scores.tail_node(), 0};
+ NodeIdxPair deltas = {_ss_deltas.tail_node(), 0};
+ NodeIdxPair anchors = {_ss_anchors.tail_node(), 0};
return GraphBuilder::add_generate_proposals_node(s.graph(), common_params, scores, deltas, anchors, _info);
}
@@ -847,15 +857,14 @@ public:
* @param[in] axis Axis to perform normalization on
* @param[in] epsilon Lower bound value for the normalization
*/
- L2NormalizeLayer(int axis, float epsilon)
- : _axis(axis), _epsilon(epsilon)
+ L2NormalizeLayer(int axis, float epsilon) : _axis(axis), _epsilon(epsilon)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_l2_normalize_node(s.graph(), common_params, input, _axis, _epsilon);
}
@@ -872,15 +881,14 @@ public:
*
* @param[in] norm_info Normalization information.
*/
- NormalizationLayer(NormalizationLayerInfo norm_info)
- : _norm_info(norm_info)
+ NormalizationLayer(NormalizationLayerInfo norm_info) : _norm_info(norm_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_normalization_node(s.graph(), common_params, input, _norm_info);
}
@@ -897,8 +905,7 @@ public:
* @param[in] mean Accessor to get mean tensor data from.
* @param[in] std Accessor to get std tensor data from.
*/
- NormalizePlanarYUVLayer(ITensorAccessorUPtr mean,
- ITensorAccessorUPtr std)
+ NormalizePlanarYUVLayer(ITensorAccessorUPtr mean, ITensorAccessorUPtr std)
: _mean(std::move(mean)), _std(std::move(std))
{
}
@@ -908,10 +915,10 @@ public:
ARM_COMPUTE_ERROR_ON(_mean == nullptr);
ARM_COMPUTE_ERROR_ON(_std == nullptr);
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_normalize_planar_yuv_node(s.graph(), common_params, input,
- std::move(_mean), std::move(_std));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_normalize_planar_yuv_node(s.graph(), common_params, input, std::move(_mean),
+ std::move(_std));
}
private:
@@ -929,15 +936,14 @@ public:
* specifies the front and the end padding in the i-th dimension.
* @param[in] pad_value Padding value to use. Defaults to 0.
*/
- PadLayer(PaddingList padding, PixelValue pad_value = PixelValue())
- : _padding(padding), _pad_value(pad_value)
+ PadLayer(PaddingList padding, PixelValue pad_value = PixelValue()) : _padding(padding), _pad_value(pad_value)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_pad_node(s.graph(), common_params, input, _padding, _pad_value);
}
@@ -956,15 +962,14 @@ public:
* @param[in] layout (Optional) Data layout to assign to permuted tensor.
* If UNKNOWN then the input's layout will be used.
*/
- PermuteLayer(PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN)
- : _perm(perm), _layout(layout)
+ PermuteLayer(PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN) : _perm(perm), _layout(layout)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_permute_node(s.graph(), common_params, input, _perm, _layout);
}
@@ -981,15 +986,14 @@ public:
*
* @param[in] pool_info Pooling information.
*/
- PoolingLayer(PoolingLayerInfo pool_info)
- : _pool_info(pool_info)
+ PoolingLayer(PoolingLayerInfo pool_info) : _pool_info(pool_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_pooling_node(s.graph(), common_params, input, _pool_info);
}
@@ -1013,9 +1017,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { _ss0.tail_node(), 0 };
- NodeIdxPair alpha = { _ss1.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {_ss0.tail_node(), 0};
+ NodeIdxPair alpha = {_ss1.tail_node(), 0};
return GraphBuilder::add_prelu_node(s.graph(), common_params, input, alpha);
}
@@ -1064,15 +1068,17 @@ public:
* @param[in] format_info (Optional) Format info.
* @param[in] transform (Optional) Input transform function.
*/
- PrintLayer(std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), const std::function<ITensor *(ITensor *)> transform = nullptr)
+ PrintLayer(std::ostream &stream,
+ const IOFormatInfo &format_info = IOFormatInfo(),
+ const std::function<ITensor *(ITensor *)> transform = nullptr)
: _stream(stream), _format_info(format_info), _transform(transform)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_print_node(s.graph(), common_params, input, _stream, _format_info, _transform);
}
@@ -1098,9 +1104,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input0 = { s.tail_node(), 0 };
- NodeIdxPair input1 = { _ss.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input0 = {s.tail_node(), 0};
+ NodeIdxPair input1 = {_ss.tail_node(), 0};
return GraphBuilder::add_priorbox_node(s.graph(), common_params, input0, input1, _prior_info);
}
@@ -1117,15 +1123,14 @@ public:
*
* @param[in] out_quant_info Output tensor quantization info
*/
- QuantizationLayer(QuantizationInfo out_quant_info)
- : _out_quant_info(out_quant_info)
+ QuantizationLayer(QuantizationInfo out_quant_info) : _out_quant_info(out_quant_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_quantization_node(s.graph(), common_params, input, _out_quant_info);
}
@@ -1150,8 +1155,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_reduction_operation_node(s.graph(), common_params, input, _op, _axis, _keep_dims);
}
@@ -1170,15 +1175,14 @@ public:
* @param[in] stride Stride value to use for reorganizing the values in the output tensor.
* It defines the spatial distance between 2 consecutive pixels in the x and y direction
*/
- ReorgLayer(int stride)
- : _stride(stride)
+ ReorgLayer(int stride) : _stride(stride)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_reorg_node(s.graph(), common_params, input, _stride);
}
@@ -1194,15 +1198,14 @@ public:
*
* @param[in] shape Target shape.
*/
- ReshapeLayer(TensorShape shape)
- : _shape(shape)
+ ReshapeLayer(TensorShape shape) : _shape(shape)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_reshape_node(s.graph(), common_params, input, _shape);
}
@@ -1221,8 +1224,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_resize_node(s.graph(), common_params, input, _policy, _width_scale, _height_scale);
}
@@ -1254,9 +1257,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { _ss_input.tail_node(), 0 };
- NodeIdxPair rois = { _ss_rois.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {_ss_input.tail_node(), 0};
+ NodeIdxPair rois = {_ss_rois.tail_node(), 0};
return GraphBuilder::add_roi_align_node(s.graph(), common_params, input, rois, _pool_info);
}
@@ -1275,16 +1278,15 @@ public:
* @param[in] mul_w Accessor to get mul weight from.
* @param[in] add_w Accessor to get add weight from.
*/
- ScaleLayer(ITensorAccessorUPtr mul_w,
- ITensorAccessorUPtr add_w)
+ ScaleLayer(ITensorAccessorUPtr mul_w, ITensorAccessorUPtr add_w)
: _mul_w(std::move(mul_w)), _add_w(std::move(add_w))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_scale_layer(s.graph(), common_params, input, std::move(_mul_w), std::move(_add_w));
}
@@ -1302,15 +1304,14 @@ public:
     * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
     * @param[in] ends   The ends of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
*/
- SliceLayer(Coordinates &starts, Coordinates &ends)
- : _starts(starts), _ends(ends)
+ SliceLayer(Coordinates &starts, Coordinates &ends) : _starts(starts), _ends(ends)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_slice_node(s.graph(), common_params, input, _starts, _ends);
}
@@ -1327,15 +1328,14 @@ public:
*
* @param[in] beta (Optional) Beta value. Default 1.0.
*/
- SoftmaxLayer(float beta = 1.0f)
- : _beta(beta)
+ SoftmaxLayer(float beta = 1.0f) : _beta(beta)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_softmax_node(s.graph(), common_params, input, _beta);
}
@@ -1354,17 +1354,14 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- StackLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
- : _sub_streams(), _axis(0)
+ StackLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams) : _sub_streams(), _axis(0)
{
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
    /** Construct a stack layer
*
@@ -1374,33 +1371,30 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- StackLayer(int axis, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
+ StackLayer(int axis, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams)
: _sub_streams(), _axis(axis)
{
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
    /** Construct a stack layer
*
* @param[in] sub_stream Sub-stream
*/
template <typename... Ts>
- StackLayer(SubStream &&sub_stream)
- : _sub_streams(), _axis(0)
+ StackLayer(SubStream &&sub_stream) : _sub_streams(), _axis(0)
{
_sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
}
NodeID create_layer(IStream &s) override
{
NodeID nid = EmptyNodeID;
- NodeParams common_params = { name(), s.hints().target_hint };
- if(_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
+ NodeParams common_params = {name(), s.hints().target_hint};
+ if (_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
{
nid = _sub_streams[0]->tail_node();
}
@@ -1408,14 +1402,14 @@ public:
{
// Collect tail nodes and stack
std::vector<NodeIdxPair> nodes;
- for(auto &ss : _sub_streams)
+ for (auto &ss : _sub_streams)
{
- if(ss && (ss->tail_node() != EmptyNodeID))
+ if (ss && (ss->tail_node() != EmptyNodeID))
{
const auto tail_node = s.graph().node(ss->tail_node());
- if(tail_node != nullptr && tail_node->type() != NodeType::Output)
+ if (tail_node != nullptr && tail_node->type() != NodeType::Output)
{
- nodes.push_back({ ss->tail_node(), 0 });
+ nodes.push_back({ss->tail_node(), 0});
}
}
}
@@ -1440,15 +1434,18 @@ public:
     * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
* @param[in] strided_slice_info Contains masks for the starts, ends and strides
*/
- StridedSliceLayer(Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo strided_slice_info)
+ StridedSliceLayer(Coordinates &starts,
+ Coordinates &ends,
+ BiStrides &strides,
+ StridedSliceLayerInfo strided_slice_info)
: _starts(starts), _ends(ends), _strides(strides), _info(strided_slice_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_strided_slice_node(s.graph(), common_params, input, _starts, _ends, _strides, _info);
}
@@ -1467,15 +1464,14 @@ public:
*
* @param[in] act_info Activation info
*/
- YOLOLayer(ActivationLayerInfo act_info)
- : _act_info(act_info)
+ YOLOLayer(ActivationLayerInfo act_info) : _act_info(act_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_yolo_node(s.graph(), common_params, input, _act_info);
}
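
For context, here is a minimal sketch of how these frontend layer classes are typically chained onto a Stream. The input shape, the single activation layer, and the null accessors are illustrative assumptions for the sketch, not values taken from this change.

#include "arm_compute/graph.h"

using namespace arm_compute::graph::frontend;

int main()
{
    Stream graph(0, "toy_graph");

    // Each << forwards to the layer's create_layer(), which adds a node
    // through GraphBuilder as shown in the hunks above.
    graph << Target::NEON
          << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), DataType::F32),
                        nullptr /* input accessor omitted in this sketch */)
          << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
          << OutputLayer(nullptr /* output accessor omitted in this sketch */);

    GraphConfig config;
    graph.finalize(Target::NEON, config);
    graph.run(); // only meaningful once real tensor accessors are attached
    return 0;
}
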
diff --git a/arm_compute/graph/frontend/Stream.h b/arm_compute/graph/frontend/Stream.h
index db22f6d91b..7e760b6373 100644
--- a/arm_compute/graph/frontend/Stream.h
+++ b/arm_compute/graph/frontend/Stream.h
@@ -27,7 +27,6 @@
#include "arm_compute/graph/frontend/IStream.h"
#include "arm_compute/graph/frontend/IStreamOperators.h"
#include "arm_compute/graph/frontend/Types.h"
-
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/GraphManager.h"
@@ -65,7 +64,7 @@ public:
void run();
// Inherited overridden methods
- void add_layer(ILayer &layer) override;
+ void add_layer(ILayer &layer) override;
Graph &graph() override;
const Graph &graph() const override;
diff --git a/arm_compute/graph/frontend/SubStream.h b/arm_compute/graph/frontend/SubStream.h
index 2283cfeebe..c54317c52b 100644
--- a/arm_compute/graph/frontend/SubStream.h
+++ b/arm_compute/graph/frontend/SubStream.h
@@ -54,7 +54,7 @@ public:
SubStream(IStream &s);
// Inherited overridden methods
- void add_layer(ILayer &layer) override;
+ void add_layer(ILayer &layer) override;
Graph &graph() override;
const Graph &graph() const override;
diff --git a/arm_compute/graph/frontend/Types.h b/arm_compute/graph/frontend/Types.h
index bc4fe7ae0d..42b28b3cd2 100644
--- a/arm_compute/graph/frontend/Types.h
+++ b/arm_compute/graph/frontend/Types.h
@@ -33,39 +33,40 @@ namespace graph
namespace frontend
{
// Import types for graph
-using graph::DataType;
using graph::DataLayout;
using graph::DataLayoutDimension;
-using graph::TensorShape;
+using graph::DataType;
using graph::PermutationVector;
+using graph::TensorShape;
using graph::ActivationLayerInfo;
+using graph::ConvolutionMethod;
+using graph::DepthwiseConvolutionMethod;
+using graph::DimensionRoundingType;
using graph::EltwiseOperation;
+using graph::FastMathHint;
using graph::FullyConnectedLayerInfo;
+using graph::GraphConfig;
+using graph::InterpolationPolicy;
using graph::NormalizationLayerInfo;
using graph::NormType;
using graph::PadStrideInfo;
using graph::PoolingLayerInfo;
using graph::PoolingType;
+using graph::Size2D;
using graph::Target;
-using graph::ConvolutionMethod;
-using graph::FastMathHint;
-using graph::DepthwiseConvolutionMethod;
using graph::TensorDescriptor;
-using graph::DimensionRoundingType;
-using graph::GraphConfig;
-using graph::InterpolationPolicy;
-using graph::Size2D;
/** Hints that can be passed to the stream to expose parameterization */
struct StreamHints
{
- Target target_hint = { Target::UNSPECIFIED }; /**< Target execution hint */
- ConvolutionMethod convolution_method_hint = { ConvolutionMethod::Default }; /**< Convolution method hint */
- DepthwiseConvolutionMethod depthwise_convolution_method_hint = { DepthwiseConvolutionMethod::Default }; /**< Depthwise Convolution method hint */
- FastMathHint fast_math_hint = { FastMathHint::Disabled }; /**< Fast math hint */
+ Target target_hint = {Target::UNSPECIFIED}; /**< Target execution hint */
+ ConvolutionMethod convolution_method_hint = {ConvolutionMethod::Default}; /**< Convolution method hint */
+ DepthwiseConvolutionMethod depthwise_convolution_method_hint = {
+ DepthwiseConvolutionMethod::Default}; /**< Depthwise Convolution method hint */
+ FastMathHint fast_math_hint = {FastMathHint::Disabled}; /**< Fast math hint */
};
} // namespace frontend
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_STREAM_TYPES_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_STREAM_TYPES_H */
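
For context, a minimal sketch of how these hints are typically adjusted before layers are added, assuming the mutable hints() accessor that the frontend layers themselves rely on (via s.hints()); the chosen target and methods are illustrative.

#include "arm_compute/graph.h"

using namespace arm_compute::graph::frontend;

void configure_hints(Stream &graph)
{
    // StreamHints members are plain fields, so they can be set directly ...
    graph.hints().target_hint             = Target::CL;
    graph.hints().convolution_method_hint = ConvolutionMethod::Winograd;
    graph.hints().fast_math_hint          = FastMathHint::Enabled;
    // ... or equivalently through the stream operators, e.g.
    // graph << Target::CL << FastMathHint::Enabled;
}
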
diff --git a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
index cb1f079a2e..61d8854a61 100644
--- a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
+++ b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/GroupedConvolutionMutator.h b/arm_compute/graph/mutators/GroupedConvolutionMutator.h
index e68c7030d0..3ed8d786fc 100644
--- a/arm_compute/graph/mutators/GroupedConvolutionMutator.h
+++ b/arm_compute/graph/mutators/GroupedConvolutionMutator.h
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/InPlaceOperationMutator.h b/arm_compute/graph/mutators/InPlaceOperationMutator.h
index 6248d86a0a..86f62f1994 100644
--- a/arm_compute/graph/mutators/InPlaceOperationMutator.h
+++ b/arm_compute/graph/mutators/InPlaceOperationMutator.h
@@ -37,7 +37,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
index 07c8ffad97..505d4ab300 100644
--- a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
+++ b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
@@ -42,7 +42,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/NodeFusionMutator.h b/arm_compute/graph/mutators/NodeFusionMutator.h
index f3e3eaa190..9d2d44f436 100644
--- a/arm_compute/graph/mutators/NodeFusionMutator.h
+++ b/arm_compute/graph/mutators/NodeFusionMutator.h
@@ -38,7 +38,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
index b14ef59532..ab9746a29b 100644
--- a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
+++ b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
index 2292e52086..ce8af0a1d7 100644
--- a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
+++ b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
private:
DataType _mutate_type;
diff --git a/arm_compute/graph/nodes/ActivationLayerNode.h b/arm_compute/graph/nodes/ActivationLayerNode.h
index 4a98ee248f..fe5f273db5 100644
--- a/arm_compute/graph/nodes/ActivationLayerNode.h
+++ b/arm_compute/graph/nodes/ActivationLayerNode.h
@@ -39,8 +39,7 @@ public:
* @param[in] info Activation Layer information
* @param[in] out_quant_info (Optional) Output quantization info
*/
- ActivationLayerNode(ActivationLayerInfo info,
- QuantizationInfo out_quant_info = QuantizationInfo());
+ ActivationLayerNode(ActivationLayerInfo info, QuantizationInfo out_quant_info = QuantizationInfo());
/** Activation metadata accessor
*
* @return The activation info of the layer
@@ -51,7 +50,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::ActivationLayer;
diff --git a/arm_compute/graph/nodes/ArgMinMaxLayerNode.h b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h
index 69191add99..65fbc36db6 100644
--- a/arm_compute/graph/nodes/ArgMinMaxLayerNode.h
+++ b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h
@@ -65,7 +65,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::ArgMinMaxLayer;
diff --git a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
index e7f4049df8..8583ed87eb 100644
--- a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
+++ b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
@@ -60,7 +60,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::BatchNormalizationLayer;
diff --git a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
index 57175eba2e..96c2544065 100644
--- a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
+++ b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
@@ -50,7 +50,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
BoundingBoxTransformInfo _bbox_info;
diff --git a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
index 0696fe56fc..d296a2dcc3 100644
--- a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
+++ b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _num_groups;
diff --git a/arm_compute/graph/nodes/ConcatenateLayerNode.h b/arm_compute/graph/nodes/ConcatenateLayerNode.h
index 8582403355..13398b1a61 100644
--- a/arm_compute/graph/nodes/ConcatenateLayerNode.h
+++ b/arm_compute/graph/nodes/ConcatenateLayerNode.h
@@ -47,7 +47,8 @@ public:
*
* @return Expected output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors, DataLayoutDimension axis);
+ static TensorDescriptor compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors,
+ DataLayoutDimension axis);
/** Disables or not the depth concatenate node
*
* @warning This is used when concatenate is performed using sub-tensors, where this node is used as a placeholder.
@@ -78,7 +79,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _total_nodes;
diff --git a/arm_compute/graph/nodes/ConstNode.h b/arm_compute/graph/nodes/ConstNode.h
index b377c60208..400b9b4d9f 100644
--- a/arm_compute/graph/nodes/ConstNode.h
+++ b/arm_compute/graph/nodes/ConstNode.h
@@ -44,7 +44,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorDescriptor _desc;
diff --git a/arm_compute/graph/nodes/ConvolutionLayerNode.h b/arm_compute/graph/nodes/ConvolutionLayerNode.h
index 99effa07dc..8a77b89f27 100644
--- a/arm_compute/graph/nodes/ConvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/ConvolutionLayerNode.h
@@ -111,7 +111,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::ConvolutionLayer;
diff --git a/arm_compute/graph/nodes/DeconvolutionLayerNode.h b/arm_compute/graph/nodes/DeconvolutionLayerNode.h
index e74adb17aa..553d05985c 100644
--- a/arm_compute/graph/nodes/DeconvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/DeconvolutionLayerNode.h
@@ -61,7 +61,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
descriptors::DeconvolutionLayerDescriptor descriptor;
diff --git a/arm_compute/graph/nodes/DepthToSpaceLayerNode.h b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h
index 25e30e2c67..5fbcc670ff 100644
--- a/arm_compute/graph/nodes/DepthToSpaceLayerNode.h
+++ b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h
@@ -56,7 +56,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
int _block_shape;
diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
index 5df86983f0..441d68d2b8 100644
--- a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
@@ -101,7 +101,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::DepthwiseConvolutionLayer;
diff --git a/arm_compute/graph/nodes/DequantizationLayerNode.h b/arm_compute/graph/nodes/DequantizationLayerNode.h
index 4910938d47..1cce71373f 100644
--- a/arm_compute/graph/nodes/DequantizationLayerNode.h
+++ b/arm_compute/graph/nodes/DequantizationLayerNode.h
@@ -46,8 +46,8 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DEQUANTIZATION_NODE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_DEQUANTIZATION_NODE_H */
diff --git a/arm_compute/graph/nodes/DetectionOutputLayerNode.h b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
index b4b910c40e..c3e067e430 100644
--- a/arm_compute/graph/nodes/DetectionOutputLayerNode.h
+++ b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
@@ -51,13 +51,14 @@ public:
*
* @return Output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const DetectionOutputLayerInfo &info);
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const DetectionOutputLayerInfo &info);
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
DetectionOutputLayerInfo _info;
diff --git a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
index 6ff78aee07..a53aaf2b9c 100644
--- a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
+++ b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
DetectionPostProcessLayerInfo _info;
@@ -59,4 +59,4 @@ private:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DETECTION_POST_PROCESS_LAYER_NODE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_DETECTION_POST_PROCESS_LAYER_NODE_H */
diff --git a/arm_compute/graph/nodes/DummyNode.h b/arm_compute/graph/nodes/DummyNode.h
index 645f1b325d..2263525a72 100644
--- a/arm_compute/graph/nodes/DummyNode.h
+++ b/arm_compute/graph/nodes/DummyNode.h
@@ -51,11 +51,11 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorShape _shape;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DUMMY_NODE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_DUMMY_NODE_H */
diff --git a/arm_compute/graph/nodes/EltwiseLayerNode.h b/arm_compute/graph/nodes/EltwiseLayerNode.h
index 7a6d8e8303..258298259f 100644
--- a/arm_compute/graph/nodes/EltwiseLayerNode.h
+++ b/arm_compute/graph/nodes/EltwiseLayerNode.h
@@ -79,7 +79,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::EltwiseLayer;
@@ -112,7 +112,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::UnaryEltwiseLayer;
diff --git a/arm_compute/graph/nodes/FlattenLayerNode.h b/arm_compute/graph/nodes/FlattenLayerNode.h
index 046114c291..af104707a1 100644
--- a/arm_compute/graph/nodes/FlattenLayerNode.h
+++ b/arm_compute/graph/nodes/FlattenLayerNode.h
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/FullyConnectedLayerNode.h b/arm_compute/graph/nodes/FullyConnectedLayerNode.h
index 9ade62bf4a..3bcf386d64 100644
--- a/arm_compute/graph/nodes/FullyConnectedLayerNode.h
+++ b/arm_compute/graph/nodes/FullyConnectedLayerNode.h
@@ -73,7 +73,7 @@ public:
*/
static TensorDescriptor compute_weights_descriptor(const TensorDescriptor &input_descriptor,
unsigned int num_outputs,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
const QuantizationInfo &weights_quant_info = QuantizationInfo());
/** Computes fully connected layer output descriptor
*
@@ -98,7 +98,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::FullyConnectedLayer;
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
index b0051b1385..d891ea49eb 100644
--- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
@@ -43,7 +43,8 @@ public:
* @param[in] fast_math_hint (Optional) Fast math hint
* @param[in] fused_activation (Optional) Fused activation layer. Disabled if not specified
*/
- FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info,
+ FusedConvolutionBatchNormalizationNode(float epsilon,
+ PadStrideInfo info,
unsigned int num_groups = 1,
ConvolutionMethod method = ConvolutionMethod::Default,
FastMathHint fast_math_hint = FastMathHint::Disabled,
@@ -122,7 +123,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayer;
diff --git a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
index a01cb9dc42..a61b155151 100644
--- a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
@@ -46,7 +46,7 @@ public:
PadStrideInfo info,
unsigned int depth_multiplier,
DepthwiseConvolutionMethod method,
- ActivationLayerInfo fused_activation = ActivationLayerInfo());
+ ActivationLayerInfo fused_activation = ActivationLayerInfo());
/** Sets the depthwise convolution layer method to use
*
@@ -117,7 +117,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer;
diff --git a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
index 6f8edc8758..b5e4b9781c 100644
--- a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
+++ b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
@@ -50,7 +50,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
GenerateProposalsInfo _info;
diff --git a/arm_compute/graph/nodes/InputNode.h b/arm_compute/graph/nodes/InputNode.h
index 07091af64f..0983d25a59 100644
--- a/arm_compute/graph/nodes/InputNode.h
+++ b/arm_compute/graph/nodes/InputNode.h
@@ -44,7 +44,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorDescriptor _desc;
diff --git a/arm_compute/graph/nodes/L2NormalizeLayerNode.h b/arm_compute/graph/nodes/L2NormalizeLayerNode.h
index 8edc5b0bf3..ed11412b70 100644
--- a/arm_compute/graph/nodes/L2NormalizeLayerNode.h
+++ b/arm_compute/graph/nodes/L2NormalizeLayerNode.h
@@ -68,7 +68,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
int _axis;
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index ae9f177ec4..d4ad32b6f0 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -50,18 +50,18 @@
#include "arm_compute/graph/nodes/NormalizationLayerNode.h"
#include "arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h"
#include "arm_compute/graph/nodes/OutputNode.h"
-#include "arm_compute/graph/nodes/PReluLayerNode.h"
#include "arm_compute/graph/nodes/PadLayerNode.h"
#include "arm_compute/graph/nodes/PermuteLayerNode.h"
#include "arm_compute/graph/nodes/PoolingLayerNode.h"
+#include "arm_compute/graph/nodes/PReluLayerNode.h"
#include "arm_compute/graph/nodes/PrintLayerNode.h"
#include "arm_compute/graph/nodes/PriorBoxLayerNode.h"
#include "arm_compute/graph/nodes/QuantizationLayerNode.h"
-#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
#include "arm_compute/graph/nodes/ReductionLayerNode.h"
#include "arm_compute/graph/nodes/ReorgLayerNode.h"
#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
#include "arm_compute/graph/nodes/ResizeLayerNode.h"
+#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
#include "arm_compute/graph/nodes/SliceLayerNode.h"
#include "arm_compute/graph/nodes/SoftmaxLayerNode.h"
#include "arm_compute/graph/nodes/SplitLayerNode.h"
diff --git a/arm_compute/graph/nodes/NormalizationLayerNode.h b/arm_compute/graph/nodes/NormalizationLayerNode.h
index 503b859e53..86f2fb9dba 100644
--- a/arm_compute/graph/nodes/NormalizationLayerNode.h
+++ b/arm_compute/graph/nodes/NormalizationLayerNode.h
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
NormalizationLayerInfo _info;
diff --git a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
index 4d84c20de0..158acc4c23 100644
--- a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
+++ b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/OutputNode.h b/arm_compute/graph/nodes/OutputNode.h
index c91bc6b699..75484ab328 100644
--- a/arm_compute/graph/nodes/OutputNode.h
+++ b/arm_compute/graph/nodes/OutputNode.h
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/PReluLayerNode.h b/arm_compute/graph/nodes/PReluLayerNode.h
index b8e6c1ae7f..532fdccb3a 100644
--- a/arm_compute/graph/nodes/PReluLayerNode.h
+++ b/arm_compute/graph/nodes/PReluLayerNode.h
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/PadLayerNode.h b/arm_compute/graph/nodes/PadLayerNode.h
index d6ff3553da..dcb5ea595b 100644
--- a/arm_compute/graph/nodes/PadLayerNode.h
+++ b/arm_compute/graph/nodes/PadLayerNode.h
@@ -56,7 +56,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::PadLayer;
diff --git a/arm_compute/graph/nodes/PermuteLayerNode.h b/arm_compute/graph/nodes/PermuteLayerNode.h
index 0b2380b51c..62654e777c 100644
--- a/arm_compute/graph/nodes/PermuteLayerNode.h
+++ b/arm_compute/graph/nodes/PermuteLayerNode.h
@@ -51,7 +51,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
PermutationVector _perm;
diff --git a/arm_compute/graph/nodes/PoolingLayerNode.h b/arm_compute/graph/nodes/PoolingLayerNode.h
index b336bb906f..c81f3f98dc 100644
--- a/arm_compute/graph/nodes/PoolingLayerNode.h
+++ b/arm_compute/graph/nodes/PoolingLayerNode.h
@@ -57,7 +57,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
PoolingLayerInfo _info;
diff --git a/arm_compute/graph/nodes/PrintLayerNode.h b/arm_compute/graph/nodes/PrintLayerNode.h
index b57ac1f6d4..e7accc8015 100644
--- a/arm_compute/graph/nodes/PrintLayerNode.h
+++ b/arm_compute/graph/nodes/PrintLayerNode.h
@@ -43,7 +43,9 @@ public:
* @param[in] format_info (Optional) Format info.
* @param[in] transform (Optional) Input transform function.
*/
- PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), const std::function<ITensor *(ITensor *)> transform = nullptr);
+ PrintLayerNode(std::ostream &stream,
+ const IOFormatInfo &format_info = IOFormatInfo(),
+ const std::function<ITensor *(ITensor *)> transform = nullptr);
/** Stream metadata accessor
*
@@ -67,7 +69,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
std::ostream &_stream;
diff --git a/arm_compute/graph/nodes/PriorBoxLayerNode.h b/arm_compute/graph/nodes/PriorBoxLayerNode.h
index c7eadd1fe5..db36bfb1e0 100644
--- a/arm_compute/graph/nodes/PriorBoxLayerNode.h
+++ b/arm_compute/graph/nodes/PriorBoxLayerNode.h
@@ -51,13 +51,14 @@ public:
*
* @return Output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const PriorBoxLayerInfo &info);
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const PriorBoxLayerInfo &info);
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
PriorBoxLayerInfo _info;
diff --git a/arm_compute/graph/nodes/QuantizationLayerNode.h b/arm_compute/graph/nodes/QuantizationLayerNode.h
index e5d81afa0e..b8e4c7d27b 100644
--- a/arm_compute/graph/nodes/QuantizationLayerNode.h
+++ b/arm_compute/graph/nodes/QuantizationLayerNode.h
@@ -51,7 +51,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::QuantizationLayer;
diff --git a/arm_compute/graph/nodes/ROIAlignLayerNode.h b/arm_compute/graph/nodes/ROIAlignLayerNode.h
index 5abd0659b5..70309a551c 100644
--- a/arm_compute/graph/nodes/ROIAlignLayerNode.h
+++ b/arm_compute/graph/nodes/ROIAlignLayerNode.h
@@ -56,7 +56,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
ROIPoolingLayerInfo _pool_info;
diff --git a/arm_compute/graph/nodes/ReductionLayerNode.h b/arm_compute/graph/nodes/ReductionLayerNode.h
index b8d295945c..ff99466c8f 100644
--- a/arm_compute/graph/nodes/ReductionLayerNode.h
+++ b/arm_compute/graph/nodes/ReductionLayerNode.h
@@ -56,7 +56,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
ReductionOperation _op;
diff --git a/arm_compute/graph/nodes/ReorgLayerNode.h b/arm_compute/graph/nodes/ReorgLayerNode.h
index 986692ed28..a3bbcdb00f 100644
--- a/arm_compute/graph/nodes/ReorgLayerNode.h
+++ b/arm_compute/graph/nodes/ReorgLayerNode.h
@@ -57,7 +57,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
int _stride;
diff --git a/arm_compute/graph/nodes/ReshapeLayerNode.h b/arm_compute/graph/nodes/ReshapeLayerNode.h
index 727d253ce5..992275c2b1 100644
--- a/arm_compute/graph/nodes/ReshapeLayerNode.h
+++ b/arm_compute/graph/nodes/ReshapeLayerNode.h
@@ -44,7 +44,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorShape _shape;
diff --git a/arm_compute/graph/nodes/ResizeLayerNode.h b/arm_compute/graph/nodes/ResizeLayerNode.h
index 79f8889f9c..480d6e517f 100644
--- a/arm_compute/graph/nodes/ResizeLayerNode.h
+++ b/arm_compute/graph/nodes/ResizeLayerNode.h
@@ -51,7 +51,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
InterpolationPolicy _policy;
diff --git a/arm_compute/graph/nodes/SliceLayerNode.h b/arm_compute/graph/nodes/SliceLayerNode.h
index 08d3794e26..63f266b217 100644
--- a/arm_compute/graph/nodes/SliceLayerNode.h
+++ b/arm_compute/graph/nodes/SliceLayerNode.h
@@ -51,7 +51,8 @@ public:
* @return Output descriptor
*/
static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const Coordinates &starts, const Coordinates &ends);
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Start coordinates accessor
*
* @return Start coordinates of the dimensions
@@ -67,7 +68,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
Coordinates _starts;
diff --git a/arm_compute/graph/nodes/SoftmaxLayerNode.h b/arm_compute/graph/nodes/SoftmaxLayerNode.h
index 0868c6ff16..2cb1ac2cf4 100644
--- a/arm_compute/graph/nodes/SoftmaxLayerNode.h
+++ b/arm_compute/graph/nodes/SoftmaxLayerNode.h
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::SoftmaxLayer;
diff --git a/arm_compute/graph/nodes/SplitLayerNode.h b/arm_compute/graph/nodes/SplitLayerNode.h
index 13cccdd447..5e6df53c0f 100644
--- a/arm_compute/graph/nodes/SplitLayerNode.h
+++ b/arm_compute/graph/nodes/SplitLayerNode.h
@@ -55,7 +55,9 @@ public:
* @return A pair with the descriptor of the split and the starting coordinates
*/
std::pair<TensorDescriptor, Coordinates> compute_output_descriptor(const TensorDescriptor &input_descriptor,
- unsigned int num_splits, int axis, unsigned int idx);
+ unsigned int num_splits,
+ int axis,
+ unsigned int idx);
/** Number of splits accessor
*
* @return Number of splits
@@ -72,7 +74,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _num_splits;
diff --git a/arm_compute/graph/nodes/StackLayerNode.h b/arm_compute/graph/nodes/StackLayerNode.h
index 2990895c2b..9f0767c9f2 100644
--- a/arm_compute/graph/nodes/StackLayerNode.h
+++ b/arm_compute/graph/nodes/StackLayerNode.h
@@ -58,7 +58,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _total_nodes;
diff --git a/arm_compute/graph/nodes/StridedSliceLayerNode.h b/arm_compute/graph/nodes/StridedSliceLayerNode.h
index 6039f312b3..f521feb780 100644
--- a/arm_compute/graph/nodes/StridedSliceLayerNode.h
+++ b/arm_compute/graph/nodes/StridedSliceLayerNode.h
@@ -84,7 +84,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
Coordinates _starts;
diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h
index 564aecfb1e..6638033044 100644
--- a/arm_compute/graph/printers/DotGraphPrinter.h
+++ b/arm_compute/graph/printers/DotGraphPrinter.h
@@ -25,7 +25,6 @@
#define ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
#include "arm_compute/graph/IGraphPrinter.h"
-
#include "arm_compute/graph/INodeVisitor.h"
#include <string>
diff --git a/arm_compute/runtime/Allocator.h b/arm_compute/runtime/Allocator.h
index 83f072ab6b..e99ddb3dac 100644
--- a/arm_compute/runtime/Allocator.h
+++ b/arm_compute/runtime/Allocator.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ALLOCATOR_H
#include "arm_compute/runtime/IAllocator.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -40,9 +39,9 @@ public:
Allocator() = default;
// Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
+ void *allocate(size_t size, size_t alignment) override;
+ void free(void *ptr) override;
std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ALLOCATOR_H */
diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h
index 21d9c25c87..9283273317 100644
--- a/arm_compute/runtime/Array.h
+++ b/arm_compute/runtime/Array.h
@@ -37,16 +37,14 @@ class Array : public IArray<T>
{
public:
/** Default constructor: empty array */
- Array()
- : IArray<T>(0), _values(nullptr)
+ Array() : IArray<T>(0), _values(nullptr)
{
}
/** Constructor: initializes an array which can contain up to max_num_points values
*
* @param[in] max_num_values Maximum number of values the array will be able to stored
*/
- Array(size_t max_num_values)
- : IArray<T>(max_num_values), _values(std::make_unique<T[]>(max_num_values))
+ Array(size_t max_num_values) : IArray<T>(max_num_values), _values(std::make_unique<T[]>(max_num_values))
{
}
@@ -72,5 +70,5 @@ using Int16Array = Array<int16_t>;
using Int32Array = Array<int32_t>;
/** Array of floats. */
using FloatArray = Array<float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_ARRAY_H */
diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h
index 0d69f2e7c5..18ffe96ee5 100644
--- a/arm_compute/runtime/BlobLifetimeManager.h
+++ b/arm_compute/runtime/BlobLifetimeManager.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_BLOBLIFETIMEMANAGER_H
#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-
#include "arm_compute/runtime/Types.h"
#include <memory>
@@ -62,7 +61,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- MappingType mapping_type() const override;
+ MappingType mapping_type() const override;
private:
// Inherited methods overridden:
diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h
index 8481fa20f9..b25efc3821 100644
--- a/arm_compute/runtime/BlobMemoryPool.h
+++ b/arm_compute/runtime/BlobMemoryPool.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_BLOBMEMORYPOOL_H
#include "arm_compute/runtime/IMemoryPool.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include "arm_compute/runtime/Types.h"
@@ -62,8 +61,8 @@ public:
BlobMemoryPool &operator=(BlobMemoryPool &&) = default;
// Inherited methods overridden:
- void acquire(MemoryMappings &handles) override;
- void release(MemoryMappings &handles) override;
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
MappingType mapping_type() const override;
std::unique_ptr<IMemoryPool> duplicate() override;
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
index 7efe208b9f..6e81a46a29 100644
--- a/arm_compute/runtime/CL/CLArray.h
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -38,8 +38,7 @@ class CLArray : public ICLArray<T>
{
public:
/** Default constructor: empty array */
- CLArray()
- : ICLArray<T>(0), _buffer()
+ CLArray() : ICLArray<T>(0), _buffer()
{
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -55,7 +54,8 @@ public:
* @param[in] max_num_values Maximum number of values the array will be able to stored
*/
CLArray(size_t max_num_values)
- : ICLArray<T>(max_num_values), _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T))
+ : ICLArray<T>(max_num_values),
+ _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T))
{
}
/** Enqueue a map operation of the allocated buffer.
@@ -91,7 +91,8 @@ protected:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override
{
ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(
+ _buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
}
void do_unmap(cl::CommandQueue &q, uint8_t *mapping) override
{
@@ -114,5 +115,5 @@ using CLInt16Array = CLArray<cl_short>;
using CLInt32Array = CLArray<cl_int>;
/** OpenCL Array of floats. */
using CLFloatArray = CLArray<cl_float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLARRAY_H */
diff --git a/arm_compute/runtime/CL/CLBufferAllocator.h b/arm_compute/runtime/CL/CLBufferAllocator.h
index 7467e9d1c6..00ff017012 100644
--- a/arm_compute/runtime/CL/CLBufferAllocator.h
+++ b/arm_compute/runtime/CL/CLBufferAllocator.h
@@ -35,9 +35,9 @@ class CLBufferAllocator final : public IAllocator
{
public:
// Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
+ void *allocate(size_t size, size_t alignment) override;
+ void free(void *ptr) override;
std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLBUFFERALLOCATOR_H */
diff --git a/arm_compute/runtime/CL/CLMemory.h b/arm_compute/runtime/CL/CLMemory.h
index 7adee66c73..5abe86bd53 100644
--- a/arm_compute/runtime/CL/CLMemory.h
+++ b/arm_compute/runtime/CL/CLMemory.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H
#define ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H
-#include "arm_compute/runtime/IMemory.h"
-
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+#include "arm_compute/runtime/IMemory.h"
#include <cstddef>
#include <memory>
@@ -75,8 +74,8 @@ public:
// Inherited methods overridden:
IMemoryRegion *region() final;
IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
+ void set_region(IMemoryRegion *region) final;
+ void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
private:
ICLMemoryRegion *_region;
diff --git a/arm_compute/runtime/CL/CLMemoryRegion.h b/arm_compute/runtime/CL/CLMemoryRegion.h
index 66a30fa56b..365973a9e6 100644
--- a/arm_compute/runtime/CL/CLMemoryRegion.h
+++ b/arm_compute/runtime/CL/CLMemoryRegion.h
@@ -110,7 +110,7 @@ public:
// Inherited methods overridden :
void *ptr() final;
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
/** OpenCL SVM memory region interface */
@@ -156,7 +156,7 @@ public:
// Inherited methods overridden :
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
/** OpenCL fine-grain SVM memory region implementation */
@@ -173,7 +173,7 @@ public:
// Inherited methods overridden :
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CL_CL_MEMORY_REGION_H */
diff --git a/arm_compute/runtime/CL/CLRuntimeContext.h b/arm_compute/runtime/CL/CLRuntimeContext.h
index dd17645fa7..2ed4b74796 100644
--- a/arm_compute/runtime/CL/CLRuntimeContext.h
+++ b/arm_compute/runtime/CL/CLRuntimeContext.h
@@ -54,11 +54,11 @@ public:
CLKernelLibrary &kernel_library();
private:
- std::unique_ptr<CLScheduler> _gpu_owned_scheduler{ nullptr };
- CLScheduler *_gpu_scheduler{ nullptr };
- CLTuner _tuner{ false };
+ std::unique_ptr<CLScheduler> _gpu_owned_scheduler{nullptr};
+ CLScheduler *_gpu_scheduler{nullptr};
+ CLTuner _tuner{false};
CLSymbols _symbols{};
- CLBackendType _backend_type{ CLBackendType::Native };
+ CLBackendType _backend_type{CLBackendType::Native};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLRUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 3030239270..b74fcb74ef 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -28,8 +28,8 @@
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLTypes.h"
@@ -63,7 +63,9 @@ public:
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
* @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void default_init(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native);
+ void default_init(ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Initialises the scheduler with context and device provided by the user
*
* @param[in] device OpenCL device to be used
@@ -71,7 +73,10 @@ public:
* @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
*/
- void default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr);
+ void default_init_with_context(cl::Device &device,
+ cl::Context &ctx,
+ ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr);
/** Re-initializes the context and command queue used by the scheduler to default values
* and sets a default device and kernel path for the @ref CLKernelLibrary.
@@ -80,7 +85,9 @@ public:
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
* @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void default_reinit(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native);
+ void default_reinit(ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Schedule the execution of the passed kernel if possible.
*
@@ -105,8 +112,12 @@ public:
* @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
* @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr,
- CLBackendType cl_backend_type = CLBackendType::Native);
+ void init(cl::Context context,
+ cl::CommandQueue queue,
+ const cl::Device &device,
+ ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Accessor for the associated CL context.
*
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
index 0a7f5f89b2..c18df8086a 100644
--- a/arm_compute/runtime/CL/CLSubTensor.h
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -46,7 +46,10 @@ public:
* @param[in] coords Coordinates of the first subtensor element inside the parent tensor.
* @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds
*/
- CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent = false);
+ CLSubTensor(ICLTensor *parent,
+ const TensorShape &tensor_shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~CLSubTensor() = default;
/** Restrict instances of this class to be copy constructed */
@@ -93,11 +96,11 @@ public:
protected:
// Inherited methods overridden:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
+ void do_unmap(cl::CommandQueue &q) override;
private:
ICLTensor *_parent;
mutable SubTensorInfo _info;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLSUBTENSOR_H */
diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h
index ae73351f27..0729935e9e 100644
--- a/arm_compute/runtime/CL/CLTensor.h
+++ b/arm_compute/runtime/CL/CLTensor.h
@@ -87,17 +87,17 @@ public:
TensorInfo *info() override;
const cl::Buffer &cl_buffer() const override;
CLQuantization quantization() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
+ void associate_memory_group(IMemoryGroup *memory_group) override;
CLRuntimeContext *context();
protected:
// Inherited methods overridden:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
+ void do_unmap(cl::CommandQueue &q) override;
private:
mutable CLTensorAllocator _allocator; /**< Instance of the OpenCL tensor allocator */
- CLRuntimeContext *_ctx{ nullptr };
+ CLRuntimeContext *_ctx{nullptr};
};
/** OpenCL Image */
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 1b061ee1d6..fde8e9c43a 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -24,15 +24,14 @@
#ifndef ARM_COMPUTE_CLTENSORALLOCATOR_H
#define ARM_COMPUTE_CLTENSORALLOCATOR_H
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLMemory.h"
#include "arm_compute/runtime/IAllocator.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
#include <cstdint>
namespace arm_compute
@@ -148,7 +147,7 @@ private:
static const cl::Buffer _empty_buffer;
private:
- CLRuntimeContext *_ctx;
+ CLRuntimeContext *_ctx;
IMemoryManageable *_owner; /**< Memory manageable object that owns the allocator */
IMemoryGroup *_associated_memory_group; /**< Registered memory manager */
CLMemory _memory; /**< OpenCL memory */
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 93aa45adc1..cf293d3d27 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -153,9 +153,9 @@ private:
std::unordered_map<std::string, CLTuningParams> _tuning_params_table;
std::unordered_map<std::string, cl::NDRange> _lws_table;
- cl::Event _kernel_event;
- bool _tune_new_kernels;
- CLTuningInfo _tuning_info;
+ cl::Event _kernel_event;
+ bool _tune_new_kernels;
+ CLTuningInfo _tuning_info;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLTUNER_H */
diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h
index 508cafac95..d9b914676a 100644
--- a/arm_compute/runtime/CL/CLTunerTypes.h
+++ b/arm_compute/runtime/CL/CLTunerTypes.h
@@ -43,7 +43,7 @@ enum class CLTunerMode
struct CLTuningInfo
{
CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */
- bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units.
+ bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units.
Internally, the library will check if this feature is available on
the target platform. This OpenCL tuner extension is still in experimental phase */
};
@@ -56,11 +56,10 @@ struct CLTuningInfo
*/
inline CLTunerMode tuner_mode_from_name(const std::string &name)
{
- static const std::map<std::string, CLTunerMode> tuner_modes =
- {
- { "exhaustive", CLTunerMode::EXHAUSTIVE },
- { "normal", CLTunerMode::NORMAL },
- { "rapid", CLTunerMode::RAPID },
+ static const std::map<std::string, CLTunerMode> tuner_modes = {
+ {"exhaustive", CLTunerMode::EXHAUSTIVE},
+ {"normal", CLTunerMode::NORMAL},
+ {"rapid", CLTunerMode::RAPID},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -71,7 +70,7 @@ inline CLTunerMode tuner_mode_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h
index 1e5ab25c26..a876fad112 100644
--- a/arm_compute/runtime/CL/CLTuningParams.h
+++ b/arm_compute/runtime/CL/CLTuningParams.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
+
#include "support/StringSupport.h"
#include <ostream>
@@ -36,8 +37,7 @@ namespace arm_compute
class CLTuningParams
{
public:
- CLTuningParams(const CLTuningParams &tuning_params)
- : _lws(tuning_params._lws), _wbsm(tuning_params._wbsm)
+ CLTuningParams(const CLTuningParams &tuning_params) : _lws(tuning_params._lws), _wbsm(tuning_params._wbsm)
{
}
@@ -45,18 +45,16 @@ public:
: _lws(lws_x, lws_y, lws_z), _wbsm(wbsm)
{
}
- CLTuningParams(cl::NDRange lws, cl_int wbsm = 0)
- : _lws(lws), _wbsm(wbsm)
+ CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) : _lws(lws), _wbsm(wbsm)
{
}
- CLTuningParams(cl_int wbsm)
- : CLTuningParams(cl::NullRange, wbsm)
+ CLTuningParams(cl_int wbsm) : CLTuningParams(cl::NullRange, wbsm)
{
}
- CLTuningParams& operator=(const CLTuningParams &other)
+ CLTuningParams &operator=(const CLTuningParams &other)
{
- _lws = other._lws;
+ _lws = other._lws;
_wbsm = other._wbsm;
return *this;
}
@@ -84,8 +82,9 @@ public:
std::string to_string(CLTuningInfo tuning_info)
{
std::string tuning_params_string = "";
- tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]);
- if(tuning_info.tune_wbsm)
+ tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) +
+ ";" + support::cpp11::to_string(_lws[2]);
+ if (tuning_info.tune_wbsm)
{
tuning_params_string += ";" + support::cpp11::to_string(_wbsm);
}
@@ -98,19 +97,19 @@ public:
std::vector<std::string> array;
std::stringstream ss(tuning_params_string);
std::string temp;
- while(ss >> temp)
+ while (ss >> temp)
{
array.push_back(temp);
}
// Read 3 values for lws
- if(array.size() < 3)
+ if (array.size() < 3)
{
return false;
}
const unsigned int lws_0 = support::cpp11::stoi(array[0]);
const unsigned int lws_1 = support::cpp11::stoi(array[1]);
const unsigned int lws_2 = support::cpp11::stoi(array[2]);
- if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
+ if (lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
{
// If lws values are 0, cl::NullRange has to be used
// otherwise the lws object will be badly created
@@ -121,9 +120,9 @@ public:
_lws = cl::NDRange(lws_0, lws_1, lws_2);
}
array.erase(array.begin(), array.begin() + 3);
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- if(array.size() < 1)
+ if (array.size() < 1)
{
return false;
}
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index d298ecd614..931740c47f 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -43,12 +43,12 @@ enum class CLGEMMKernelType
/** OpenCL GEMM kernel selection parameters. These information are retrieved to select the GEMM kernel on OpenCL */
struct CLGEMMKernelSelectionParams
{
- unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
- unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int b{ 0 }; /**< Batch size */
- bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
- DataType data_type{ DataType::UNKNOWN }; /**< Data type */
+ unsigned int m{0}; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
+ unsigned int n{0}; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int k{0}; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int b{0}; /**< Batch size */
+ bool is_rhs_constant{false}; /**< True if the content of the rhs matrix is constant */
+ DataType data_type{DataType::UNKNOWN}; /**< Data type */
};
/** List the possible OpenCL backends */
diff --git a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
index 7be9393388..5a71a61203 100644
--- a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
+++ b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
@@ -40,8 +40,7 @@ public:
*
* @param[in] arch GPU target
*/
- ICLGEMMKernelSelection(GPUTarget arch)
- : _target(arch)
+ ICLGEMMKernelSelection(GPUTarget arch) : _target(arch)
{
}
/** Default Move Constructor. */
@@ -59,7 +58,8 @@ public:
virtual CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) = 0;
protected:
- GPUTarget _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */
+ GPUTarget
+ _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */
};
} // namespace cl_gemm
} // namespace arm_compute
diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h
index 38bcaf32f2..c0826e7733 100644
--- a/arm_compute/runtime/CL/ICLOperator.h
+++ b/arm_compute/runtime/CL/ICLOperator.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ICLOPERATOR_H
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/IOperator.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include "arm_compute/runtime/Types.h"
@@ -56,8 +55,8 @@ public:
ICLOperator &operator=(ICLOperator &&) = default;
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
MemoryRequirements workspace() const override;
protected:
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
index 4a718ab4b6..e158efa093 100644
--- a/arm_compute/runtime/CL/functions/CLActivationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_CLACTIVATIONLAYER_H
#define ARM_COMPUTE_CLACTIVATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -91,7 +90,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] act_info Activation layer parameters.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ ActivationLayerInfo act_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index ce5bee8d95..d340d20a1f 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -91,7 +91,11 @@ public:
* @param[out] output Output source tensor. Data types supported: U32/S32.
* @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int axis,
+ ICLTensor *output,
+ const ReductionOperation &op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
*
* @param[in] input Input source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
index 37a0680709..f57bc8fe8b 100644
--- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -84,7 +83,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -102,9 +107,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -120,10 +131,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
index 861330b9d4..20b9fdafed 100644
--- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYER_H
#define ARM_COMPUTE_CLBATCHTOSPACELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -82,7 +81,10 @@ public:
* @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
ARM_COMPUTE_DEPRECATED_REL(23.05)
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -91,7 +93,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info = CropInfo{});
+ void configure(const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Set the input and output tensors. (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -101,7 +107,12 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info = CropInfo{});
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -124,7 +135,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{});
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
index b30be9b24f..f82af3af9b 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
@@ -61,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEAND_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
index 1456ebe57e..31f8e86802 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
@@ -60,5 +60,5 @@ public:
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISENOT_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
index ff0a1f0d73..9a25a2099e 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
@@ -61,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
index 0cd9d073b4..9e288ef7b6 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
@@ -61,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEXOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
index d3499c3949..dba5497f5d 100644
--- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
+++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
@@ -64,7 +64,10 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -76,7 +79,11 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -90,7 +97,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H */
diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
index 650cd11b9b..9433f08fac 100644
--- a/arm_compute/runtime/CL/functions/CLCast.h
+++ b/arm_compute/runtime/CL/functions/CLCast.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLCAST_H
#define ARM_COMPUTE_CLCAST_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -79,7 +78,8 @@ public:
*/
void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
// Initialize the function's source, destination
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLCast
*
* @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
index 3dc62595d2..8ca848a020 100644
--- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
+++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
@@ -65,7 +65,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h
index 3f984900ee..fca4b168b0 100644
--- a/arm_compute/runtime/CL/functions/CLComparison.h
+++ b/arm_compute/runtime/CL/functions/CLComparison.h
@@ -66,7 +66,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
 * @param[in]  operation Comparison operation to be used.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparison
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -76,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
};
/** Basic function to run @ref CLComparisonKernel */
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index 71e84e21b5..88c4bed595 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLCONCATENATELAYER_H
#define ARM_COMPUTE_CLCONCATENATELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
#include <vector>
@@ -95,7 +94,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
*/
- void configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+ void configure(const CLCompileContext &compile_context,
+ std::vector<const ICLTensor *> &inputs_vector,
+ ICLTensor *output,
+ size_t axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
*
* @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
@@ -108,7 +110,8 @@ public:
*
* @return a status
*/
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status
+ validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
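Also outside the patch, a sketch of the validate()-before-configure() pattern these headers describe, here for the concatenation above; t0, t1 and out are assumed to be already-initialised CLTensors and all names are placeholders.

    std::vector<const ICLTensor *>   inputs = {&t0, &t1};
    std::vector<const ITensorInfo *> infos  = {t0.info(), t1.info()};

    const size_t axis = 0; // concatenate along dimension 0 (width)
    const Status st   = CLConcatenateLayer::validate(infos, out.info(), axis);
    if (st.error_code() == ErrorCode::OK)
    {
        CLConcatenateLayer concat;
        concat.configure(inputs, &out, axis); // inputs_vector is taken by reference
        concat.run();
    }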
diff --git a/arm_compute/runtime/CL/functions/CLConv3D.h b/arm_compute/runtime/CL/functions/CLConv3D.h
index 5728fe79d8..aabaf01ab7 100644
--- a/arm_compute/runtime/CL/functions/CLConv3D.h
+++ b/arm_compute/runtime/CL/functions/CLConv3D.h
@@ -77,20 +77,33 @@ public:
* @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info);
/** Set the src and dst tensors.
*
* Similar to CLConv3D::configure() but using the default compile context
*
*/
- void configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info);
+ void configure(const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLConv3D
*
* Similar to CLConv3D::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
// Inherited methods overridden:
void run() override;
@@ -99,5 +112,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCONVOLUTION3DLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
index 0a634b5482..409430d595 100644
--- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
@@ -69,7 +69,10 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Initialize the function.
*
* @param[in] compile_context The compile context to be used.
@@ -78,7 +81,11 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeights
*
* @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -86,7 +93,10 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
void run() override;
@@ -144,7 +154,10 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
_func.configure(compile_context, input, &_output, original_input_shape, data_layout);
}
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 77bf48d613..8487be71c3 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -120,8 +120,16 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -142,9 +150,17 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -166,9 +182,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -188,8 +211,15 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info,
+ const GPUTarget gpu_target,
+ const Size2D &dilation = Size2D(1U, 1U),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
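A sketch, not part of this change, of how the long configure() signature above is usually invoked with its optional trailing arguments spelled out; src/weights/biases/dst are assumed to be initialised and allocated ICLTensors and the hyper-parameters are illustrative.

    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 1, 1), // stride 1x1, padding 1x1
                   WeightsInfo(),             // weights not pre-reshaped
                   Size2D(1U, 1U),            // no dilation
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), // fused ReLU
                   false,                     // enable_fast_math
                   1);                        // num_groups
    conv.prepare(); // one-off weight transformations
    conv.run();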
diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h
index 4fc4183d3e..fd40b7b9de 100644
--- a/arm_compute/runtime/CL/functions/CLCopy.h
+++ b/arm_compute/runtime/CL/functions/CLCopy.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -74,7 +75,10 @@ public:
* @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLCopy
*
* @param[in] input Source tensor. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLCrop.h b/arm_compute/runtime/CL/functions/CLCrop.h
index a474215190..2942e9362a 100644
--- a/arm_compute/runtime/CL/functions/CLCrop.h
+++ b/arm_compute/runtime/CL/functions/CLCrop.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -71,7 +72,13 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
/** Configure function
*
* @note Supported tensor rank: up to 4
@@ -85,8 +92,14 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *output_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -100,8 +113,13 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *output_window = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
// Inherited methods overridden:
void run() override;
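Still outside the patch, a brief sketch of the Coordinates2D arguments taken by the crop configure() above; src/dst are assumed to be suitably initialised tensors and the crop window is illustrative.

    CLCrop crop;
    crop.configure(&src, &dst,
                   Coordinates2D{0, 0},   // top-left corner of the crop (x, y)
                   Coordinates2D{63, 63}, // bottom-right corner of the crop
                   0U,                    // batch_index: which image of the batch to crop
                   0.f);                  // extrapolation_value for out-of-image reads
    crop.run();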
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h
index 5c60c2879c..6fb055e893 100644
--- a/arm_compute/runtime/CL/functions/CLCropResize.h
+++ b/arm_compute/runtime/CL/functions/CLCropResize.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_CL_CROP_RESIZE_H
#include "arm_compute/core/CL/ICLTensor.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLCrop.h"
@@ -82,8 +81,13 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -100,8 +104,14 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
 /** Static function to check if given info will lead to a valid configuration of @ref CLCropResize
*
@@ -121,8 +131,13 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+ static Status validate(const ITensorInfo *input,
+ ITensorInfo *boxes,
+ ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value);
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 0c59e2c86d..92f87ee461 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -70,7 +70,12 @@ public:
* @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -82,8 +87,13 @@ public:
* @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
@@ -95,11 +105,19 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
- static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info);
+ static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info);
// Inherited methods overridden:
void run() override;
void prepare() override;
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index 344ebd0afb..5a2abafe79 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -82,7 +82,8 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
index 58deb7ec40..3e7ca8830b 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLDEPTHCONVERT_H
#define ARM_COMPUTE_CLDEPTHCONVERT_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -96,7 +95,11 @@ public:
* @param[in] policy Conversion policy.
* @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ uint32_t shift);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayer
*
* @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
index 0026cc2b67..14d0a7ec7c 100644
--- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
@@ -60,7 +60,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayer.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -71,5 +72,5 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index 2798449100..2c0fa7aa22 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -88,15 +88,28 @@ public:
*
* @note: For in-place support, please check @ref CLDepthwiseConvolutionLayerNativeKernel
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Initialize the function's source, destination, weights and convolution information.
*
* Similar to @ref CLDepthwiseConvolutionLayer::configure()
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer
*
@@ -104,8 +117,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
 // Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
index 462a3ac07e..84900b03a3 100644
--- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
@@ -79,7 +79,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -94,7 +99,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer
*
@@ -111,7 +121,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -121,5 +135,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index d0a61cdd36..14384a09b5 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -24,12 +24,11 @@
#ifndef ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -111,7 +110,12 @@ public:
* @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -125,8 +129,13 @@ public:
* @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -140,8 +149,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
index 9de362d2b2..13844c98a1 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
@@ -82,7 +82,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* Valid configurations (Input1,Input2) -> Output :
@@ -108,7 +112,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for addition
*
@@ -134,7 +142,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -192,7 +204,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* Valid configurations (Input1,Input2) -> Output :
@@ -218,7 +234,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for subtraction
*
@@ -244,7 +264,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -292,7 +316,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output.
*
* @param[in] compile_context The compile context to be used.
@@ -303,7 +330,11 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -313,7 +344,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -368,7 +402,10 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -379,7 +416,11 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for max
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
@@ -389,7 +430,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -444,7 +488,10 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -455,7 +502,11 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for min
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
@@ -465,7 +516,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -518,7 +572,10 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -529,7 +586,11 @@ public:
* @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for squared difference
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
@@ -539,7 +600,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -587,7 +651,10 @@ public:
 * @param[out] output Output tensor. Data types supported: F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -598,7 +665,11 @@ public:
 * @param[out] output Output tensor. Data types supported: F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for power
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -608,7 +679,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
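One more illustrative sketch (again not part of the patch): the ConvertPolicy and optional fused-activation arguments shared by the arithmetic functions above, shown for an addition; a, b and sum are placeholder, already-prepared CLTensors.

    CLArithmeticAddition add;
    add.configure(&a, &b, &sum,
                  ConvertPolicy::SATURATE, // clamp on overflow instead of wrapping
                  ActivationLayerInfo());  // no fused activation
    add.run();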
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
index 594ee4cfdc..d186b70d93 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
#define ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
index c7112dc737..49ecf3c260 100644
--- a/arm_compute/runtime/CL/functions/CLFFT1D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CLFFT1D_H
#define ARM_COMPUTE_CLFFT1D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
@@ -82,7 +81,10 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT1DInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFT1D.
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h
index 3d20327bf1..b7d15f1602 100644
--- a/arm_compute/runtime/CL/functions/CLFFT2D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT2D.h
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_CLFFT2D_H
#define ARM_COMPUTE_CLFFT2D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
@@ -79,7 +78,10 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT2DInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFT2D.
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
index f873cb0b86..ed78bbb7a7 100644
--- a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
@@ -37,6 +35,7 @@
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -94,8 +93,13 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Set the input and output tensors.
*
* @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -113,8 +117,14 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTConvolutionLayer
*
* @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -133,8 +143,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h
index 341d93a9f6..be1059761a 100644
--- a/arm_compute/runtime/CL/functions/CLFill.h
+++ b/arm_compute/runtime/CL/functions/CLFill.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -73,7 +74,10 @@ public:
* @param[in] constant_value The value used to fill the planes of the tensor
* @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLFill
*
* @param[in] tensor Source tensor info. Data types supported: All.
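
The change above only re-wraps the CLFill::configure() overloads; behaviour is unchanged. For orientation, a minimal usage sketch of the single-tensor fill follows. The tensor shape and fill value are illustrative, and the CLScheduler/CLTensor host-side set-up is the library's usual boilerplate rather than anything introduced by this patch.

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFill.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create the OpenCL context, queue and kernel library

        CLTensor plane;
        plane.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32)); // illustrative 16x16 F32 tensor

        CLFill fill;
        fill.configure(&plane, PixelValue(0.f)); // fill the whole tensor with zeros (no sub-window)

        plane.allocator()->allocate();
        fill.run();
        CLScheduler::get().sync(); // wait for the OpenCL queue to drain
        return 0;
    }
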
diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h
index 87cd5b44c7..4d3d704857 100644
--- a/arm_compute/runtime/CL/functions/CLFloor.h
+++ b/arm_compute/runtime/CL/functions/CLFloor.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLFLOOR_H
#define ARM_COMPUTE_CLFLOOR_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index b784226a2f..9fd0b4aaef 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -25,9 +25,8 @@
#define ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -45,7 +44,8 @@ class CLFullyConnectedLayer : public IFunction
{
public:
/** Constructor */
- CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Default destructor */
~CLFullyConnectedLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -83,13 +83,20 @@ public:
* Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Set the input and output tensors.
*
* Similar to @ref CLFullyConnectedLayer
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer
*
@@ -97,7 +104,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
//Inherited methods override
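
A similar sketch for CLFullyConnectedLayer, using the four-tensor configure() overload re-wrapped above. The shapes assume the default FullyConnectedLayerInfo (weights laid out as [num_inputs, num_outputs]) and are purely illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Illustrative shapes: 128 input features, 10 outputs, batch of 4.
        CLTensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));      // [num_inputs, batch]
        weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32)); // [num_inputs, num_outputs]
        bias.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));       // [num_outputs, batch]

        CLFullyConnectedLayer fc;
        fc.configure(&src, &weights, &bias, &dst); // default FullyConnectedLayerInfo

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src/weights/bias via map()/unmap() ...

        fc.run();
        CLScheduler::get().sync();
        return 0;
    }
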
diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
index cd75270392..2e777273cd 100644
--- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
+++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
@@ -78,9 +78,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -97,9 +104,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalization
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -117,10 +132,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 3a39aca692..f5e6aa1237 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -92,13 +92,26 @@ public:
* if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
* in case matrix A and matrix B have been already transformed.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Initialise the kernel's inputs and output
*
* Similar to @ref CLGEMM::configure()
*/
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMM.
*
@@ -106,7 +119,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
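
For CLGEMM the re-wrapped configure() takes (a, b, c, output, alpha, beta, gemm_info). Below is a sketch computing dst = A * B with no bias matrix; matrix shapes follow the library's [width, height] = [cols, rows] convention and are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMM.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // A: M=32 x K=64, shape {K, M}
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));   // B: K=64 x N=16, shape {N, K}
        dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // C: M=32 x N=16, shape {N, M}

        CLGEMM gemm;
        gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f); // dst = 1.0f * A * B, no C matrix

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill a and b via map()/unmap() ...

        gemm.run();
        CLScheduler::get().sync();
        return 0;
    }
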
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 4bafef27a9..70ceb1513b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -52,7 +52,8 @@ public:
* @param[in] memory_manager (Optional) Memory manager.
* @param[in] weights_manager (Optional) Weights manager.
*/
- CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLGEMMConvolutionLayer(const CLGEMMConvolutionLayer &) = delete;
/** Default move constructor */
@@ -95,8 +96,15 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -116,9 +124,16 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -139,8 +154,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
// Inherited methods overridden:
void run() override;
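
A sketch for CLGEMMConvolutionLayer with the default WeightsInfo/dilation/activation arguments; the NCHW shapes (8x8x3 input, 16 filters of 3x3, stride 1, pad 1) are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));          // [width, height, IFM]
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32)); // [kw, kh, IFM, OFM]
        bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));         // [width, height, OFM]

        CLGEMMConvolutionLayer conv;
        conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1)); // stride_x, stride_y, pad_x, pad_y

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        conv.run();
        CLScheduler::get().sync();
        return 0;
    }
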
diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
index c985738a9c..3e8929c5ad 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
@@ -113,7 +113,11 @@ public:
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info);
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -124,7 +128,12 @@ public:
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -136,7 +145,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 8b8d9f235f..1b8e5dcc1d 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -91,7 +91,11 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Initialise the kernel's inputs, output
*
* @note GEMMLowp: low precision GEMM kernel. [A * B + C]
@@ -110,7 +114,12 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
*
* @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
@@ -122,7 +131,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
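
A quantized sketch for CLGEMMLowpMatrixMultiplyCore, producing raw S32 accumulators that would normally be fed to an output stage. The QASYMM8 quantization parameters and shapes are illustrative assumptions.

    #include "arm_compute/core/QuantizationInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Illustrative quantized shapes: A is 4x16, B is 16x8, accumulators are S32.
        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 5)));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::S32)); // raw S32 accumulators

        CLGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(&a, &b, nullptr, &dst); // no bias, default GEMMInfo

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        gemmlowp.run();
        CLScheduler::get().sync();
        return 0;
    }
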
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index 6ec7b71f7d..ff9c872896 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -85,7 +85,8 @@ public:
* @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
/** Initialise the kernel's inputs, output
*
* @param[in] compile_context The compile context to be used.
@@ -95,7 +96,11 @@ public:
* @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*
* @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
@@ -106,7 +111,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h
index 0f1ccbad08..360c8757b6 100644
--- a/arm_compute/runtime/CL/functions/CLGather.h
+++ b/arm_compute/runtime/CL/functions/CLGather.h
@@ -62,7 +62,11 @@ public:
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
@@ -73,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLGATHER_H */
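
A sketch for CLGather selecting three elements of a 1-D tensor along axis 0; the shapes and the U32 index type follow the documentation above and are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGather.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, indices, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32)); // three indices into src
        dst.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));

        CLGather gather;
        gather.configure(&src, &indices, &dst, 0); // gather along axis 0

        src.allocator()->allocate();
        indices.allocator()->allocate();
        dst.allocator()->allocate();

        gather.run();
        CLScheduler::get().sync();
        return 0;
    }
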
diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
index aec5cdf1a8..3a201e79b0 100644
--- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
@@ -100,7 +100,12 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+ void configure(const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info);
/** Set the input and output tensors.
*
@@ -118,8 +123,14 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out,
- ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGenerateProposalsLayer
*
@@ -135,7 +146,11 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ static Status validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals,
const GenerateProposalsInfo &info);
diff --git a/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
index 12b83ea25b..91952af5dc 100644
--- a/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
@@ -75,7 +75,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -90,7 +95,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLIndirectConvolutionLayer
*
@@ -107,7 +117,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -117,5 +131,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
index 985a6a75f7..98d215dd4b 100644
--- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
@@ -83,7 +83,12 @@ public:
* @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
* @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution
*/
- void configure(ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -95,7 +100,13 @@ public:
* @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
* @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -108,8 +119,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
- void run() override;
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
+ void run() override;
private:
std::unique_ptr<ICLKernel> _inst_norm_kernel; /**< Kernel to run */
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index 4dc5c778d2..a8b356a708 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -26,8 +26,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -89,7 +89,8 @@ public:
* @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
* @param[in] epsilon (Optional) Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f);
+ void configure(
+ const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer.
*
@@ -111,5 +112,5 @@ private:
std::unique_ptr<CLL2NormalizeLayerKernel> _normalize_kernel;
CLTensor _sumsq;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLL2NORMALIZELAYER_H */
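
A sketch for CLL2NormalizeLayer normalizing along axis 0 with the default epsilon; the 2-D shape is illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // L2-normalize each row of a 16x4 F32 tensor along axis 0.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));

        CLL2NormalizeLayer l2_norm;
        l2_norm.configure(&src, &dst, 0); // axis 0, default epsilon

        src.allocator()->allocate();
        dst.allocator()->allocate();

        l2_norm.run();
        CLScheduler::get().sync();
        return 0;
    }
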
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index d26b4c5595..fe494991af 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_CLLSTMLAYER_H
#define ARM_COMPUTE_CLLSTMLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
@@ -37,9 +35,10 @@
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
#include <memory>
@@ -53,7 +52,7 @@ namespace kernels
{
class ClTransposeKernel;
}
-}
+} // namespace opencl
/** This function performs a single time step in a Long Short-Term Memory (LSTM) layer.
*
@@ -120,13 +119,26 @@ public:
* @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0f then clipping is disabled.
*/
- void configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Initialize function's tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -166,13 +178,27 @@ public:
* @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0f then clipping is disabled.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayer
*
@@ -214,13 +240,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
// Inherited methods overridden:
void run() override;
@@ -311,7 +350,7 @@ private:
bool _perform_projection_clipping;
bool _is_prepared;
bool _is_layer_norm_lstm;
- const ICLTensor *_recurrent_to_cell_weights{ nullptr };
+ const ICLTensor *_recurrent_to_cell_weights{nullptr};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLLSTMLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
index 9c004b85d0..8c116b1482 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -35,7 +35,6 @@
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -100,11 +99,22 @@ public:
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
*/
void configure(const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out);
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out);
/** Initialize function's tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -126,12 +136,24 @@ public:
* @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized
*
@@ -156,11 +178,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLLogicalAnd.h b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
index e3061e1dc3..4ff488782a 100644
--- a/arm_compute/runtime/CL/functions/CLLogicalAnd.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
@@ -111,7 +111,8 @@ public:
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported: same as @p input1.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel
*
* @param[in] input1 First tensor input info. Data types supported: U8.
diff --git a/arm_compute/runtime/CL/functions/CLLogicalNot.h b/arm_compute/runtime/CL/functions/CLLogicalNot.h
index 27fd0f9c9f..c7d9db93d7 100644
--- a/arm_compute/runtime/CL/functions/CLLogicalNot.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalNot.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLLOGICALNOT_H
#define ARM_COMPUTE_CLLOGICALNOT_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -85,7 +84,7 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
- void run() override;
+ void run() override;
private:
struct Impl;
diff --git a/arm_compute/runtime/CL/functions/CLLogicalOr.h b/arm_compute/runtime/CL/functions/CLLogicalOr.h
index 893c22f721..64b6d83177 100644
--- a/arm_compute/runtime/CL/functions/CLLogicalOr.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalOr.h
@@ -111,7 +111,8 @@ public:
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported: same as @p input1.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel
*
* @param[in] input1 First tensor input info. Data types supported: U8.
diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h
index 9d54bab868..9c9939b9d0 100644
--- a/arm_compute/runtime/CL/functions/CLMatMul.h
+++ b/arm_compute/runtime/CL/functions/CLMatMul.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -88,14 +89,23 @@ public:
* @param[in] settings Contains flags for function level settings
* @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const
- ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *rhs,
+ ICLTensor *lhs,
+ ICLTensor *dst,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings = GpuMatMulSettings{},
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
/** Initialise the kernel's inputs and output
*
* Similar to @ref CLMatMul::configure()
*/
- void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}, const ActivationLayerInfo &act_info =
- ActivationLayerInfo{});
+ void configure(ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *dst,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings = GpuMatMulSettings{},
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLMatMul.
*
*
@@ -107,7 +117,11 @@ public:
* @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
* @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
index f7ff1234f6..2d2f064b4c 100644
--- a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
@@ -92,7 +92,11 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -105,7 +109,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
@@ -114,5 +121,5 @@ private:
CLFill _fill;
std::unique_ptr<CLMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
index 68a7df24e6..951db3e419 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
@@ -65,7 +65,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
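
A sketch of the in-place form of CLMeanStdDevNormalizationLayer (output left as nullptr, default epsilon); the 2-D shape is illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Normalize each of the 4 rows of a 2-D tensor in place.
        CLTensor src;
        src.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));

        CLMeanStdDevNormalizationLayer msd_norm;
        msd_norm.configure(&src); // output == nullptr -> in-place, default epsilon

        src.allocator()->allocate();

        msd_norm.run();
        CLScheduler::get().sync();
        return 0;
    }
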
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 15406f7728..10fd8ed4c6 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -89,7 +89,10 @@ public:
* Data types supported: same as @p input. Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const NormalizationLayerInfo &norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -100,7 +103,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
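
A sketch for CLNormalizationLayer using cross-map (LRN-style) normalization with norm_size = 5; the feature-map shape is illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Cross-map normalization of a 16x16x8 feature map.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

        CLNormalizationLayer norm;
        norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5)); // norm_size = 5, default alpha/beta

        src.allocator()->allocate();
        dst.allocator()->allocate();

        norm.run();
        CLScheduler::get().sync();
        return 0;
    }
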
diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
index de5155c65a..3473af1004 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
@@ -62,7 +62,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: Same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayer
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -74,7 +78,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index 7f950bcfb3..89e693bd92 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -76,7 +76,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
@@ -88,8 +92,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayer.
*
@@ -101,7 +109,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run() override;
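
A sketch for CLPadLayer using the default CONSTANT mode and zero fill; the padding amounts and shapes are illustrative (each output dimension grows by the per-side padding).

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPadLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Pad a 4x4 tensor by 1 on each side of dim 0 and 2 on each side of dim 1.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(6U, 8U), 1, DataType::F32)); // (4+1+1) x (4+2+2)

        CLPadLayer pad;
        pad.configure(&src, &dst, PaddingList{{1, 1}, {2, 2}}); // CONSTANT mode, zero fill by default

        src.allocator()->allocate();
        dst.allocator()->allocate();

        pad.run();
        CLScheduler::get().sync();
        return 0;
    }
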
diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h
index 8e15da2287..7ac0bf6b9c 100644
--- a/arm_compute/runtime/CL/functions/CLPermute.h
+++ b/arm_compute/runtime/CL/functions/CLPermute.h
@@ -78,7 +78,10 @@ public:
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration of @ref CLPermute.
*
* @note Arbitrary permutation vectors are supported with rank not greater than 4
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index 62b6d96ad5..f3e5cf9bd3 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -84,8 +84,13 @@ public:
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and convertion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -100,8 +105,14 @@ public:
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
@@ -115,8 +126,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -151,7 +167,10 @@ public:
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs and output.
*
* @param[in] compile_context The compile context to be used.
@@ -162,7 +181,11 @@ public:
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: F16/F32. Number of channels supported: 2.
@@ -170,7 +193,10 @@ public:
* @param[in] output The output tensor info, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
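Usage sketch (illustrative, not part of this patch): the configure()/validate() pair reformatted above is typically used as below, checking the tensor infos with validate() before configuring the function. The 16x16 F32 shapes, the unit scale and the SATURATE/TO_ZERO policies are assumptions chosen for the example.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, out;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        // Check the configuration on the tensor infos first, then configure the function itself.
        const Status status = CLPixelWiseMultiplication::validate(a.info(), b.info(), out.info(), 1.f,
                                                                  ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
        if(status.error_code() != ErrorCode::OK)
        {
            return 1; // unsupported configuration on this target
        }

        CLPixelWiseMultiplication mul;
        mul.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        mul.run();
        CLScheduler::get().sync();
        return 0;
    }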
diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
index 91c46770da..1c69148771 100644
--- a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLPOOLING3DLAYER_H
#define ARM_COMPUTE_CLPOOLING3DLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -82,7 +81,10 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Pooling3dLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLPooling3dLayer
*
* @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 2163c16801..3dbdf8aeea 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLPOOLINGLAYER_H
#define ARM_COMPUTE_CLPOOLINGLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -74,7 +73,8 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+ void
+ configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -83,7 +83,11 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -93,7 +97,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
void run() override;
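Usage sketch (illustrative, not part of this patch): a 2x2 max pooling with stride 2 on an NCHW tensor, using the validate-then-configure pattern documented above. The PoolingLayerInfo constructor arguments and the shapes are assumptions about this version of the API.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // 8x8 plane with 3 channels in NCHW order (W, H, C); 2x2 pooling with stride 2 halves W and H.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));

        // Assumed PoolingLayerInfo constructor order: (pool type, pool size, data layout, pad/stride).
        const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

        if(CLPoolingLayer::validate(src.info(), dst.info(), pool_info).error_code() != ErrorCode::OK)
        {
            return 1;
        }

        CLPoolingLayer pool;
        pool.configure(&src, &dst, pool_info); // the optional indices output keeps its default nullptr

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pool.run();
        CLScheduler::get().sync();
        return 0;
    }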
diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
index 9b36c9e433..4ede906baa 100644
--- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
@@ -66,7 +66,11 @@ public:
* @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
* @param[in] info Prior box layer info.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayer
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -76,12 +80,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
private:
cl::Buffer _min;
cl::Buffer _max;
cl::Buffer _aspect_ratios;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLPRIORBOXLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index 1b0b759d74..3e76da086f 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -32,7 +32,6 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -127,12 +126,21 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ void configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params);
/** Initialize function's tensors.
@@ -177,12 +185,22 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer
@@ -227,12 +245,21 @@ public:
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params);
// Inherited methods overridden:
@@ -266,10 +293,18 @@ private:
* @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor.
*
*/
- void configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res,
- CLTensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+ void configure_mm(const CLCompileContext &compile_context,
+ CLGEMMLowpMatrixMultiplyCore &mm,
+ CLGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input,
+ const ICLTensor *mm_weights,
+ const ICLTensor *bias,
+ CLTensor *mm_res,
+ CLTensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info);
MemoryGroup _memory_group{};
@@ -278,8 +313,8 @@ private:
{
static constexpr uint32_t max_dimension_supported = 2;
- ICLTensor *_src{ nullptr };
- ICLTensor *_dst{ nullptr };
+ ICLTensor *_src{nullptr};
+ ICLTensor *_dst{nullptr};
size_t _row_size{};
Window _window{};
@@ -368,7 +403,7 @@ private:
CLArithmeticAddition _accumulate_projection{};
CLActivationLayer _projection_clip{};
std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
- CLCopy _copy_output;
+ CLCopy _copy_output;
TensorCopyKernel _projection_bias_copy{};
TensorCopyKernel _projection_output_to_accumulate_copy{};
@@ -376,21 +411,18 @@ private:
TensorCopyKernel _hidden_to_output_copy{};
// Tensor pointers
- const ICLTensor *_input_to_input_weights
- {
- nullptr
- };
- const ICLTensor *_recurrent_to_input_weights{ nullptr };
- const ICLTensor *_projection_bias{ nullptr };
- const ICLTensor *_input_to_forget_weights{ nullptr };
- const ICLTensor *_input_to_cell_weights{ nullptr };
- const ICLTensor *_input_to_output_weights{ nullptr };
- const ICLTensor *_recurrent_to_forget_weights{ nullptr };
- const ICLTensor *_recurrent_to_cell_weights{ nullptr };
- const ICLTensor *_recurrent_to_output_weights{ nullptr };
- const ICLTensor *_projection_weights{ nullptr };
- std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{ {} };
- std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{ {} };
+ const ICLTensor *_input_to_input_weights{nullptr};
+ const ICLTensor *_recurrent_to_input_weights{nullptr};
+ const ICLTensor *_projection_bias{nullptr};
+ const ICLTensor *_input_to_forget_weights{nullptr};
+ const ICLTensor *_input_to_cell_weights{nullptr};
+ const ICLTensor *_input_to_output_weights{nullptr};
+ const ICLTensor *_recurrent_to_forget_weights{nullptr};
+ const ICLTensor *_recurrent_to_cell_weights{nullptr};
+ const ICLTensor *_recurrent_to_output_weights{nullptr};
+ const ICLTensor *_projection_weights{nullptr};
+ std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{{}};
+ std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{{}};
using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -423,78 +455,78 @@ private:
return *_layer_norms[getGateIndex(g)];
}
- inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
+ inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
- CLTensor _input_to_forget_weights_transposed{ nullptr };
- CLTensor _input_to_cell_weights_transposed{ nullptr };
- CLTensor _input_to_output_weights_transposed{ nullptr };
- CLTensor _input_to_input_weights_transposed{ nullptr };
- CLTensor _recurrent_to_forget_weights_transposed{ nullptr };
- CLTensor _recurrent_to_cell_weights_transposed{ nullptr };
- CLTensor _recurrent_to_output_weights_transposed{ nullptr };
- CLTensor _recurrent_to_input_weights_transposed{ nullptr };
- CLTensor _projection_weights_transposed{ nullptr };
- CLTensor _input_to_input_eff_bias{ nullptr };
- CLTensor _recurrent_to_input_eff_bias{ nullptr };
- CLTensor _input_to_forget_eff_bias{ nullptr };
- CLTensor _recurrent_to_forget_eff_bias{ nullptr };
- CLTensor _input_to_cell_eff_bias{ nullptr };
- CLTensor _recurrent_to_cell_eff_bias{ nullptr };
- CLTensor _input_to_output_eff_bias{ nullptr };
- CLTensor _recurrent_to_output_eff_bias{ nullptr };
- CLTensor _projection_reduction_res{ nullptr };
- CLTensor _projection_eff_bias{ nullptr };
- CLTensor _mm_input_to_forget_res{ nullptr };
- CLTensor _mm_recurrent_to_forget_res{ nullptr };
- CLTensor _mul_cell_to_forget_res{ nullptr };
- CLTensor _input_to_forget_outstage_res{ nullptr };
- CLTensor _cell_to_forget_outstage_res{ nullptr };
- CLTensor _recurrent_to_forget_outstage_res{ nullptr };
- CLTensor _forget_gate{ nullptr };
- CLTensor _mm_input_to_cell_res{ nullptr };
- CLTensor _input_to_cell_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_cell_res{ nullptr };
- CLTensor _recurrent_to_cell_outstage_res{ nullptr };
- CLTensor _cell_gate{ nullptr };
- CLTensor _mul_input_cell_res{ nullptr };
- CLTensor _mm_input_to_input_res{ nullptr };
- CLTensor _input_to_input_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_input_res{ nullptr };
- CLTensor _mul_cell_to_input_res{ nullptr };
- CLTensor _cell_to_input_outstage_res{ nullptr };
- CLTensor _recurrent_to_input_outstage_res{ nullptr };
- CLTensor _input_gate{ nullptr };
- CLTensor _mm_input_to_output_res{ nullptr };
- CLTensor _input_to_output_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_output_res{ nullptr };
- CLTensor _mul_cell_to_output_res{ nullptr };
- CLTensor _cell_to_output_outstage_res{ nullptr };
- CLTensor _recurrent_to_output_outstage_res{ nullptr };
- CLTensor _output_gate{ nullptr };
- CLTensor _hidden_mul_res{ nullptr };
- CLTensor _hidden_gate{ nullptr };
- CLTensor _mm_projection_res{ nullptr };
- CLTensor _projection_outstage_res{ nullptr };
- CLTensor _projection_out_res{ nullptr };
- CLTensor _projection_accumulate_res{ nullptr };
- CLTensor _ones{ nullptr };
- std::array<CLTensor, _layer_norm_count> _layer_norm_output{ {} };
+ CLTensor _input_to_forget_weights_transposed{nullptr};
+ CLTensor _input_to_cell_weights_transposed{nullptr};
+ CLTensor _input_to_output_weights_transposed{nullptr};
+ CLTensor _input_to_input_weights_transposed{nullptr};
+ CLTensor _recurrent_to_forget_weights_transposed{nullptr};
+ CLTensor _recurrent_to_cell_weights_transposed{nullptr};
+ CLTensor _recurrent_to_output_weights_transposed{nullptr};
+ CLTensor _recurrent_to_input_weights_transposed{nullptr};
+ CLTensor _projection_weights_transposed{nullptr};
+ CLTensor _input_to_input_eff_bias{nullptr};
+ CLTensor _recurrent_to_input_eff_bias{nullptr};
+ CLTensor _input_to_forget_eff_bias{nullptr};
+ CLTensor _recurrent_to_forget_eff_bias{nullptr};
+ CLTensor _input_to_cell_eff_bias{nullptr};
+ CLTensor _recurrent_to_cell_eff_bias{nullptr};
+ CLTensor _input_to_output_eff_bias{nullptr};
+ CLTensor _recurrent_to_output_eff_bias{nullptr};
+ CLTensor _projection_reduction_res{nullptr};
+ CLTensor _projection_eff_bias{nullptr};
+ CLTensor _mm_input_to_forget_res{nullptr};
+ CLTensor _mm_recurrent_to_forget_res{nullptr};
+ CLTensor _mul_cell_to_forget_res{nullptr};
+ CLTensor _input_to_forget_outstage_res{nullptr};
+ CLTensor _cell_to_forget_outstage_res{nullptr};
+ CLTensor _recurrent_to_forget_outstage_res{nullptr};
+ CLTensor _forget_gate{nullptr};
+ CLTensor _mm_input_to_cell_res{nullptr};
+ CLTensor _input_to_cell_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_cell_res{nullptr};
+ CLTensor _recurrent_to_cell_outstage_res{nullptr};
+ CLTensor _cell_gate{nullptr};
+ CLTensor _mul_input_cell_res{nullptr};
+ CLTensor _mm_input_to_input_res{nullptr};
+ CLTensor _input_to_input_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_input_res{nullptr};
+ CLTensor _mul_cell_to_input_res{nullptr};
+ CLTensor _cell_to_input_outstage_res{nullptr};
+ CLTensor _recurrent_to_input_outstage_res{nullptr};
+ CLTensor _input_gate{nullptr};
+ CLTensor _mm_input_to_output_res{nullptr};
+ CLTensor _input_to_output_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_output_res{nullptr};
+ CLTensor _mul_cell_to_output_res{nullptr};
+ CLTensor _cell_to_output_outstage_res{nullptr};
+ CLTensor _recurrent_to_output_outstage_res{nullptr};
+ CLTensor _output_gate{nullptr};
+ CLTensor _hidden_mul_res{nullptr};
+ CLTensor _hidden_gate{nullptr};
+ CLTensor _mm_projection_res{nullptr};
+ CLTensor _projection_outstage_res{nullptr};
+ CLTensor _projection_out_res{nullptr};
+ CLTensor _projection_accumulate_res{nullptr};
+ CLTensor _ones{nullptr};
+ std::array<CLTensor, _layer_norm_count> _layer_norm_output{{}};
inline CLTensor &get_layer_norm_output(LayerNormGate g)
{
return _layer_norm_output[getGateIndex(g)];
}
- bool _is_prepared{ false };
- bool _has_cifg{ false };
- bool _has_cell_clipping{ false };
- bool _has_projection{ false };
- bool _has_projection_clipping{ false };
- bool _has_peephole{ false };
- bool _has_layer_norm{ false };
- bool _projection_tensor_copy_required{ false };
+ bool _is_prepared{false};
+ bool _has_cifg{false};
+ bool _has_cell_clipping{false};
+ bool _has_projection{false};
+ bool _has_projection_clipping{false};
+ bool _has_peephole{false};
+ bool _has_layer_norm{false};
+ bool _projection_tensor_copy_required{false};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLQLSTMLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 2b3b35e37d..a8d835d04d 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CLRNN_LAYER_H
#define ARM_COMPUTE_CLRNN_LAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include <memory>
@@ -69,7 +69,13 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
@@ -81,8 +87,14 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -95,7 +107,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
const ActivationLayerInfo &info);
// Inherited methods overridden:
@@ -114,5 +131,5 @@ private:
CLTensor _add_output;
bool _is_prepared;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLRNN_LAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
index 1eaea1b297..14d3476711 100644
--- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
@@ -68,7 +68,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -84,7 +85,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -100,7 +105,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLROIALIGNLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
index 151586a1f6..86294596d2 100644
--- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
@@ -66,7 +66,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -81,7 +82,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIPoolingLayer
*
@@ -97,7 +102,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLROIPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h
index fbce05162c..ed665bc398 100644
--- a/arm_compute/runtime/CL/functions/CLRange.h
+++ b/arm_compute/runtime/CL/functions/CLRange.h
@@ -73,7 +73,8 @@ public:
* @param[in] end The ending (not including) value of the sequence.
* @param[in] step The gap between each pair of values in the sequence. Default is 1.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f);
/** Static function to check if given info will lead to a valid configuration of @ref CLRange
*
* @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 1ce088b2ce..640fe7cf1b 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CL_REDUCE_MEAN_H
#define ARM_COMPUTE_CL_REDUCE_MEAN_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
namespace arm_compute
@@ -74,7 +74,11 @@ public:
* @param[in] keep_dims If positive, retains reduced dimensions with length 1.
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLReduceMean
*
@@ -85,7 +89,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+ static Status
+ validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
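Usage sketch (illustrative, not part of this patch): reducing axes 0 and 1 of a rank-3 tensor with keep_dims set to true, so the reduced dimensions are kept with length 1. The shapes and axes are assumptions for the example.

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceMean.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 3U), 1, DataType::F32));
        // keep_dims = true keeps the reduced axes with length 1: (8, 4, 3) -> (1, 1, 3).
        dst.allocator()->init(TensorInfo(TensorShape(1U, 1U, 3U), 1, DataType::F32));

        CLReduceMean mean;
        mean.configure(&src, Coordinates(0, 1), /* keep_dims */ true, &dst); // mean over axes 0 and 1

        src.allocator()->allocate();
        dst.allocator()->allocate();
        mean.run();
        CLScheduler::get().sync();
        return 0;
    }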
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 2245735b62..80068ac35c 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -80,7 +80,8 @@ public:
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
* @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
- void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ void
+ configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -90,7 +91,12 @@ public:
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
* @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
*
@@ -102,7 +108,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
// Inherited methods overridden:
void run() override;
@@ -118,4 +128,4 @@ private:
bool _is_reshape_required;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
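Usage sketch (illustrative, not part of this patch): a SUM reduction along axis 0 with the default keep_dims = true, matching the overloads reformatted above. The shapes are assumptions for the example.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // axis 0 reduced, keep_dims defaults to true

        CLReductionOperation sum;
        sum.configure(&src, &dst, /* axis */ 0, ReductionOperation::SUM);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        sum.run();
        CLScheduler::get().sync();
        return 0;
    }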
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index 7346b65e9b..dad90e6ba9 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/runtime/CL/ICLOperator.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
#include <memory>
namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h
index 94c63ca92d..0defc3f28b 100644
--- a/arm_compute/runtime/CL/functions/CLReverse.h
+++ b/arm_compute/runtime/CL/functions/CLReverse.h
@@ -59,7 +59,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
* @param[in] input Input tensor info. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index ddb4a23531..5c3824eb58 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -83,7 +83,10 @@ public:
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLScale
*
diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h
index 8b1e6b2019..effcb58313 100644
--- a/arm_compute/runtime/CL/functions/CLSelect.h
+++ b/arm_compute/runtime/CL/functions/CLSelect.h
@@ -62,7 +62,11 @@ public:
* @param[in] y Second input tensor. Data types supported: Same as @p x
* @param[out] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelect
*
* @param[in] c Condition input tensor. Data types supported: U8.
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h
index 297bcd86fe..7a274ded72 100644
--- a/arm_compute/runtime/CL/functions/CLSlice.h
+++ b/arm_compute/runtime/CL/functions/CLSlice.h
@@ -84,7 +84,11 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Static function to check if given info will lead to a valid configuration of @ref CLSlice
*
@@ -100,7 +104,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
// Inherited methods overridden:
void run() override;
@@ -129,7 +134,11 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Static function to check if given info will lead to a valid configuration of @ref CLSlice
*
@@ -145,7 +154,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
};
} // namespace experimental
} // namespace arm_compute
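Usage sketch (illustrative, not part of this patch): the runtime CLSlice function (not the experimental operator above) extracting a 3x2 region, with start coordinates inclusive and end coordinates exclusive. The shapes and coordinates are assumptions for the example.

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSlice.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(6U, 4U), 1, DataType::F32));
        // Slice [1, 4) along dimension 0 and [0, 2) along dimension 1: the result is 3x2.
        dst.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));

        CLSlice slice;
        slice.configure(&src, &dst, Coordinates(1, 0), Coordinates(4, 2)); // starts and (exclusive) ends per dimension

        src.allocator()->allocate();
        dst.allocator()->allocate();
        slice.run();
        CLScheduler::get().sync();
        return 0;
    }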
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 687f8ff6d8..70ef1f4402 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -87,7 +87,11 @@ public:
* @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
* axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ float beta = 1.0f,
+ int32_t axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
*
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index 304a74137e..191f4863d5 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -83,7 +83,11 @@ public:
* @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -93,7 +97,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
@@ -104,8 +113,13 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -115,7 +129,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer (Static block shape and paddings)
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -127,7 +144,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
index 8a47e95f9d..1b0dfc2b74 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
@@ -75,7 +75,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayer.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLSplit.h b/arm_compute/runtime/CL/functions/CLSplit.h
index 86c7bdde7d..8d13755212 100644
--- a/arm_compute/runtime/CL/functions/CLSplit.h
+++ b/arm_compute/runtime/CL/functions/CLSplit.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
#include "arm_compute/runtime/IFunction.h"
diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h
index 54c903a706..18745c8a4f 100644
--- a/arm_compute/runtime/CL/functions/CLStackLayer.h
+++ b/arm_compute/runtime/CL/functions/CLStackLayer.h
@@ -85,7 +85,10 @@ public:
* Negative values wrap around
* @param[out] output Output tensor. Data types supported: Same as @p input.
*/
- void configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const std::vector<ICLTensor *> &input,
+ int axis,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h
index 6fab0c0186..b1edc2481c 100644
--- a/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -74,9 +74,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -92,9 +97,15 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
*
@@ -110,9 +121,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
// Inherited methods overridden:
void run() override;
@@ -143,9 +159,15 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
*
@@ -161,9 +183,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
};
} // namespace experimental
} // namespace arm_compute
diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h
index c266adbbd4..4c414670a5 100644
--- a/arm_compute/runtime/CL/functions/CLTile.h
+++ b/arm_compute/runtime/CL/functions/CLTile.h
@@ -59,7 +59,10 @@ public:
* @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
* @param[out] output Destination tensor. Same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTile
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
index a866aeabaa..9dc977fbeb 100644
--- a/arm_compute/runtime/CL/functions/CLTranspose.h
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -88,6 +88,6 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLTRANSPOSE_H */
diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h
index 32ad439b70..a6eee43177 100644
--- a/arm_compute/runtime/CL/functions/CLUnstack.h
+++ b/arm_compute/runtime/CL/functions/CLUnstack.h
@@ -26,9 +26,8 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -72,7 +71,10 @@ public:
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const std::vector<ICLTensor *> &output_vector,
+ int axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLUnstack
*
* @param[in] input Input tensor info. Data type supported: All.
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index adf5f18626..efea9a1550 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -84,8 +84,13 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Set the input and output tensors.
*
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
@@ -104,8 +109,14 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer
*
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1 and 1x5 kernels along with unit strides for both NCHW and NHWC data layout
@@ -125,8 +136,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
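Usage sketch (illustrative, not part of this patch): a 3x3 Winograd convolution with unit strides and 'same' padding, which is within the kernel/stride combinations listed in the notes above. The NCHW shapes, the F32 data type and the default act_info/fast-math settings are assumptions for the example.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // NCHW shapes (W, H, C[, N]): an 8x8 input with 2 channels and four 3x3 kernels.
        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32)); // pad 1, stride 1 keeps W and H

        CLWinogradConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // act_info and fast math keep their defaults

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        conv.run();
        CLScheduler::get().sync();
        return 0;
    }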
diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
index 69572c98d2..5f6d12b4a7 100644
--- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
+++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
#include "arm_compute/runtime/CL/CLTuningParams.h"
+
#include "support/ToolchainSupport.h"
#include <memory>
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index a5932d6301..7f70b5fa1f 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -55,10 +55,10 @@ public:
static CPPScheduler &get();
// Inherited functions overridden
- void set_num_threads(unsigned int num_threads) override;
- void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override;
+ void set_num_threads(unsigned int num_threads) override;
+ void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override;
unsigned int num_threads() const override;
- void schedule(ICPPKernel *kernel, const Hints &hints) override;
+ void schedule(ICPPKernel *kernel, const Hints &hints) override;
void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
protected:
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
index 58b4bf25cc..9af4ed6208 100644
--- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -61,8 +61,16 @@ public:
* @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32.
* @param[in] info (Optional) BoxNMSLimitInfo information.
*/
- void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ void configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out = nullptr,
+ ITensor *keeps = nullptr,
+ ITensor *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer
*
* @param[in] scores_in The scores input tensor of size [count, num_classes]. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -81,9 +89,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out,
- const ITensorInfo *classes,
- const ITensorInfo *batch_splits_out = nullptr, const ITensorInfo *keeps = nullptr, const ITensorInfo *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ static Status validate(const ITensorInfo *scores_in,
+ const ITensorInfo *boxes_in,
+ const ITensorInfo *batch_splits_in,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *boxes_out,
+ const ITensorInfo *classes,
+ const ITensorInfo *batch_splits_out = nullptr,
+ const ITensorInfo *keeps = nullptr,
+ const ITensorInfo *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
index f2c7ccccc5..dc8c8e76ba 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H
#define ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -52,7 +51,11 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
+ void configure(const ITensor *input_loc,
+ const ITensor *input_conf,
+ const ITensor *input_priorbox,
+ ITensor *output,
+ DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer
*
* @param[in] input_loc The mbox location input tensor info. Data types supported: F32.
@@ -63,7 +66,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -82,12 +88,12 @@ private:
int _num_priors;
int _num;
- std::vector<LabelBBox> _all_location_predictions;
+ std::vector<LabelBBox> _all_location_predictions;
std::vector<std::map<int, std::vector<float>>> _all_confidence_scores;
- std::vector<BBox> _all_prior_bboxes;
- std::vector<std::array<float, 4>> _all_prior_variances;
- std::vector<LabelBBox> _all_decode_bboxes;
- std::vector<std::map<int, std::vector<int>>> _all_indices;
+ std::vector<BBox> _all_prior_bboxes;
+ std::vector<std::array<float, 4>> _all_prior_variances;
+ std::vector<LabelBBox> _all_decode_bboxes;
+ std::vector<std::map<int, std::vector<int>>> _all_indices;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
index 94248ff314..a40e4f9ecb 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H
#define ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
@@ -65,8 +64,14 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+ void configure(const ITensor *input_box_encoding,
+ const ITensor *input_score,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionPostProcessLayer
*
* @param[in] input_box_encoding The bounding box input tensor info. Data types supported: F32/QASYMM8/QASYMM8_SIGNED.
@@ -80,8 +85,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+ static Status validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
index 71c44a8bd1..af6afc6029 100644
--- a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
+++ b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H
#define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -48,7 +47,12 @@ public:
* @param[in] nms_threshold The threshold used in non maximum suppression.
*
*/
- void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold);
+ void configure(const ITensor *bboxes,
+ const ITensor *scores,
+ ITensor *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold);
/** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression
*
@@ -60,8 +64,12 @@ public:
* @param[in] nms_threshold The threshold used in non maximum suppression.
*
*/
- static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold);
+ static Status validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h
index 85c1502324..232da41b8e 100644
--- a/arm_compute/runtime/CPP/functions/CPPPermute.h
+++ b/arm_compute/runtime/CPP/functions/CPPPermute.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPPERMUTE_H
#define ARM_COMPUTE_CPPPERMUTE_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -53,5 +52,5 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPPERMUTE_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h
index 56aad2db4b..9be081f5bb 100644
--- a/arm_compute/runtime/CPP/functions/CPPSplit.h
+++ b/arm_compute/runtime/CPP/functions/CPPSplit.h
@@ -29,7 +29,6 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
@@ -39,8 +38,7 @@ template <typename SliceType, typename TensorInterfaceType = ITensor>
class CPPSplit : public IFunction
{
public:
- CPPSplit()
- : _outputs_vector(), _slice_functions(), _num_outputs(0)
+ CPPSplit() : _outputs_vector(), _slice_functions(), _num_outputs(0)
{
}
/** Static function to check if given info will lead to a valid configuration of @ref CPPSplit
@@ -64,14 +62,16 @@ public:
unsigned int total_output_shape_size = 0;
// Sum the output sizes and fall back to evenly-sized splits if any are zero
- const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(), [&total_output_shape_size](ITensorInfo * info)
- {
- unsigned int output_shape_size = info->tensor_shape().total_size();
- total_output_shape_size += output_shape_size;
- return output_shape_size == 0;
- });
-
- if(using_split_shapes)
+ const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(),
+ [&total_output_shape_size](ITensorInfo *info)
+ {
+ unsigned int output_shape_size =
+ info->tensor_shape().total_size();
+ total_output_shape_size += output_shape_size;
+ return output_shape_size == 0;
+ });
+
+ if (using_split_shapes)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != total_output_shape_size);
}
@@ -83,10 +83,10 @@ public:
// Validate output tensors
unsigned int axis_offset = 0;
- for(const auto &output : outputs)
+ for (const auto &output : outputs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- if(using_split_shapes)
+ if (using_split_shapes)
{
output_shape = output->tensor_shape();
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
@@ -97,14 +97,14 @@ public:
// Start/End coordinates
Coordinates start_coords;
Coordinates end_coords;
- for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
{
end_coords.set(d, -1);
}
// Output auto inizialitation if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- if(tmp_output_info.tensor_shape().total_size() == 0)
+ if (tmp_output_info.tensor_shape().total_size() == 0)
{
tmp_output_info = input->clone()->set_is_resizable(true).set_tensor_shape(output_shape);
}
@@ -128,7 +128,8 @@ public:
* from the split dimension.
* @param[in] axis Axis on which to split the input.
*/
- void configure(const TensorInterfaceType *input, const std::vector<TensorInterfaceType *> &outputs, unsigned int axis)
+ void
+ configure(const TensorInterfaceType *input, const std::vector<TensorInterfaceType *> &outputs, unsigned int axis)
{
// Create Slice functions
_num_outputs = outputs.size();
@@ -136,17 +137,16 @@ public:
// Extract output tensor info
std::vector<ITensorInfo *> outputs_info;
- for(auto &output : outputs)
+ for (auto &output : outputs)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
outputs_info.emplace_back(output->info());
}
// If any of the outputs have a zero size, fall-back to using evenly-sized output splits
- const bool outputs_have_sizes = std::none_of(outputs_info.begin(), outputs_info.end(), [](ITensorInfo * info)
- {
- return info->tensor_shape().total_size() == 0;
- });
+ const bool outputs_have_sizes =
+ std::none_of(outputs_info.begin(), outputs_info.end(),
+ [](ITensorInfo *info) { return info->tensor_shape().total_size() == 0; });
// Validate
ARM_COMPUTE_ERROR_THROW_ON(CPPSplit::validate(input->info(), outputs_info, axis));
@@ -154,12 +154,13 @@ public:
unsigned int axis_offset = 0;
unsigned int i = 0;
- for(const auto &output_info : outputs_info)
+ for (const auto &output_info : outputs_info)
{
// Get output shape
- TensorShape output_shape = (outputs_have_sizes ?
- output_info->tensor_shape() :
- arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs));
+ TensorShape output_shape =
+ (outputs_have_sizes
+ ? output_info->tensor_shape()
+ : arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs));
const size_t axis_split_step = output_shape[axis];
@@ -167,7 +168,7 @@ public:
Coordinates start_coords;
Coordinates end_coords;
- for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
{
end_coords.set(d, -1);
}
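The hunk above only reflows the std::none_of call that simultaneously sums the requested output sizes and detects a zero-sized output (the cue to fall back to evenly-sized splits). A standalone sketch of that accumulating-predicate idiom, using plain integers instead of ITensorInfo, might look as follows; note that none_of short-circuits at the first zero, exactly like the original.

#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<unsigned int> output_sizes{32, 32, 0, 32};

    unsigned int total_size = 0;
    // The lambda accumulates the running total as a side effect while the
    // predicate itself reports whether the current size is zero.
    const bool using_split_shapes = std::none_of(output_sizes.begin(), output_sizes.end(),
                                                 [&total_size](unsigned int size)
                                                 {
                                                     total_size += size;
                                                     return size == 0;
                                                 });

    // When a zero is found the traversal stops early, so total_size is only
    // meaningful when using_split_shapes is true.
    std::cout << "explicit shapes usable: " << using_split_shapes << ", total elements seen: " << total_size
              << std::endl;
    return 0;
}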
diff --git a/arm_compute/runtime/CPP/functions/CPPTopKV.h b/arm_compute/runtime/CPP/functions/CPPTopKV.h
index 2f63084056..232cbb3067 100644
--- a/arm_compute/runtime/CPP/functions/CPPTopKV.h
+++ b/arm_compute/runtime/CPP/functions/CPPTopKV.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPTOPKV_H
#define ARM_COMPUTE_CPPTOPKV_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -54,7 +53,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
+ static Status
+ validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPTOPKV_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPUpsample.h b/arm_compute/runtime/CPP/functions/CPPUpsample.h
index b97d4d1cc1..3b0f997b17 100644
--- a/arm_compute/runtime/CPP/functions/CPPUpsample.h
+++ b/arm_compute/runtime/CPP/functions/CPPUpsample.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPUPSAMPLE_H
#define ARM_COMPUTE_CPPUPSAMPLE_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -44,5 +43,5 @@ public:
*/
void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPUPSAMPLE_H */
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 05f172b9f1..c3af17d6f2 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -41,16 +41,16 @@ enum class FFTDirection
/** Descriptor used by the FFT1D function */
struct FFT1DInfo
{
- unsigned int axis{ 0 }; /**< Axis to run the FFT on. */
- FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
+ unsigned int axis{0}; /**< Axis to run the FFT on. */
+ FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */
};
/** Descriptor used by the FFT2D function */
struct FFT2DInfo
{
- unsigned int axis0{ 0 }; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/
- unsigned int axis1{ 1 }; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
- FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
+ unsigned int axis0{0}; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/
+ unsigned int axis1{1}; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
+ FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */
};
/** Descriptor used by the 2d Convolution function */
@@ -64,15 +64,20 @@ struct Conv2dInfo
bool enable_fast_math,
unsigned int num_groups,
const WeightsInfo &weights_info = WeightsInfo())
- : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups), weights_info(weights_info)
+ : conv_info(conv_info),
+ dilation(dilation),
+ act_info(act_info),
+ enable_fast_math(enable_fast_math),
+ num_groups(num_groups),
+ weights_info(weights_info)
{
}
PadStrideInfo conv_info{};
- Size2D dilation{ 1U, 1U };
+ Size2D dilation{1U, 1U};
ActivationLayerInfo act_info{};
- bool enable_fast_math{ false };
- unsigned int num_groups{ 1 };
+ bool enable_fast_math{false};
+ unsigned int num_groups{1};
WeightsInfo weights_info{};
};
@@ -87,16 +92,21 @@ struct Conv3dInfo
const Size3D &dilation,
const DimensionRoundingType &round_type,
bool enable_fast_math)
- : stride(stride), padding(padding), act_info(act_info), dilation(dilation), round_type(round_type), enable_fast_math(enable_fast_math)
+ : stride(stride),
+ padding(padding),
+ act_info(act_info),
+ dilation(dilation),
+ round_type(round_type),
+ enable_fast_math(enable_fast_math)
{
}
- Size3D stride{ 1U, 1U, 1U };
+ Size3D stride{1U, 1U, 1U};
Padding3D padding{};
ActivationLayerInfo act_info{};
- Size3D dilation{ 1U, 1U, 1U };
+ Size3D dilation{1U, 1U, 1U};
DimensionRoundingType round_type{};
- bool enable_fast_math{ false };
+ bool enable_fast_math{false};
};
} // namespace arm_compute
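FFT1DInfo and FFT2DInfo above are plain descriptor structs with default member initializers, so they are normally filled field by field before being handed to the corresponding FFT function. A small illustrative sketch; the Inverse enumerator is assumed from the wider API and is not visible in this hunk.

#include "arm_compute/runtime/FunctionDescriptors.h"

int main()
{
    using namespace arm_compute;

    FFT1DInfo fft1d;
    fft1d.axis      = 0;                     // transform along the innermost axis
    fft1d.direction = FFTDirection::Forward; // default shown in the descriptor above

    FFT2DInfo fft2d;
    fft2d.axis0     = 0;
    fft2d.axis1     = 1;
    fft2d.direction = FFTDirection::Inverse; // assumed counterpart of Forward

    return 0;
}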
diff --git a/arm_compute/runtime/IAllocator.h b/arm_compute/runtime/IAllocator.h
index 5c28b24fea..f8446db811 100644
--- a/arm_compute/runtime/IAllocator.h
+++ b/arm_compute/runtime/IAllocator.h
@@ -56,5 +56,5 @@ public:
*/
virtual std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IALLOCATOR_H */
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index b7b28f999d..fb68dbbecf 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -58,5 +58,5 @@ public:
{
}
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IFUNCTION_H */
diff --git a/arm_compute/runtime/IMemoryGroup.h b/arm_compute/runtime/IMemoryGroup.h
index a977a4a3c3..77198dd29d 100644
--- a/arm_compute/runtime/IMemoryGroup.h
+++ b/arm_compute/runtime/IMemoryGroup.h
@@ -86,8 +86,7 @@ public:
*
* @param[in] memory_group Memory group to handle
*/
- explicit MemoryGroupResourceScope(IMemoryGroup &memory_group)
- : _memory_group(memory_group)
+ explicit MemoryGroupResourceScope(IMemoryGroup &memory_group) : _memory_group(memory_group)
{
_memory_group.acquire();
}
@@ -100,5 +99,5 @@ public:
private:
IMemoryGroup &_memory_group;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IMEMORYGROUP_H */
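MemoryGroupResourceScope above acquires the group in its constructor; assuming the matching release happens in its destructor (not visible in this hunk), the usual pattern is to place one on the stack for the duration of a run() call. A hypothetical sketch:

#include "arm_compute/runtime/IMemoryGroup.h"
#include "arm_compute/runtime/MemoryGroup.h"

namespace
{
// Hypothetical function object owning a memory group; only the RAII usage matters here.
class ExampleFunction
{
public:
    void run()
    {
        // Backing memory for all managed tensors is acquired here and released
        // again when scope_mg goes out of scope, even on an early return.
        arm_compute::MemoryGroupResourceScope scope_mg(_memory_group);

        // ... schedule the kernels that use the managed tensors ...
    }

private:
    arm_compute::MemoryGroup _memory_group{nullptr};
};
} // namespace

int main()
{
    ExampleFunction fn;
    fn.run();
    return 0;
}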
diff --git a/arm_compute/runtime/IMemoryManager.h b/arm_compute/runtime/IMemoryManager.h
index 4d7d8cd9c9..42910edfda 100644
--- a/arm_compute/runtime/IMemoryManager.h
+++ b/arm_compute/runtime/IMemoryManager.h
@@ -65,5 +65,5 @@ public:
*/
virtual void clear() = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IMEMORYMANAGER_H */
diff --git a/arm_compute/runtime/IMemoryPool.h b/arm_compute/runtime/IMemoryPool.h
index b8d36c362d..0c112c8f35 100644
--- a/arm_compute/runtime/IMemoryPool.h
+++ b/arm_compute/runtime/IMemoryPool.h
@@ -60,5 +60,5 @@ public:
*/
virtual std::unique_ptr<IMemoryPool> duplicate() = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_IMEMORYPOOL_H */
diff --git a/arm_compute/runtime/IMemoryRegion.h b/arm_compute/runtime/IMemoryRegion.h
index 914aa57fbe..9431663e4e 100644
--- a/arm_compute/runtime/IMemoryRegion.h
+++ b/arm_compute/runtime/IMemoryRegion.h
@@ -37,8 +37,7 @@ public:
*
* @param[in] size Region size
*/
- explicit IMemoryRegion(size_t size)
- : _size(size)
+ explicit IMemoryRegion(size_t size) : _size(size)
{
}
/** Virtual Destructor */
diff --git a/arm_compute/runtime/IPoolManager.h b/arm_compute/runtime/IPoolManager.h
index 481bde5fb6..5f6d4ffbe5 100644
--- a/arm_compute/runtime/IPoolManager.h
+++ b/arm_compute/runtime/IPoolManager.h
@@ -69,5 +69,5 @@ public:
*/
virtual size_t num_pools() const = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IPOOLMANAGER_H */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index df5a44001f..ae204c8560 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -25,8 +25,8 @@
#define ARM_COMPUTE_ISCHEDULER_H
#include "arm_compute/core/CPP/CPPTypes.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include <functional>
#include <limits>
@@ -226,7 +226,11 @@ protected:
*
* @return Adjusted number of windows
*/
- std::size_t adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info);
+ std::size_t adjust_num_of_windows(const Window &window,
+ std::size_t split_dimension,
+ std::size_t init_num_windows,
+ const ICPPKernel &kernel,
+ const CPUInfo &cpu_info);
private:
unsigned int _num_threads_hint = {};
diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h
index b2d17c6fea..9e481bb563 100644
--- a/arm_compute/runtime/ISimpleLifetimeManager.h
+++ b/arm_compute/runtime/ISimpleLifetimeManager.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H
#include "arm_compute/runtime/ILifetimeManager.h"
-
#include "arm_compute/runtime/IMemoryPool.h"
#include "arm_compute/runtime/Types.h"
@@ -70,7 +69,11 @@ protected:
/** Element struct */
struct Element
{
- Element(void *id_ = nullptr, IMemory *handle_ = nullptr, size_t size_ = 0, size_t alignment_ = 0, bool status_ = false)
+ Element(void *id_ = nullptr,
+ IMemory *handle_ = nullptr,
+ size_t size_ = 0,
+ size_t alignment_ = 0,
+ bool status_ = false)
: id(id_), handle(handle_), size(size_), alignment(alignment_), status(status_)
{
}
@@ -90,11 +93,12 @@ protected:
std::set<void *> bound_elements;
};
- IMemoryGroup *_active_group; /**< Active group */
- std::map<void *, Element> _active_elements; /**< A map that contains the active elements */
- std::list<Blob> _free_blobs; /**< Free blobs */
- std::list<Blob> _occupied_blobs; /**< Occupied blobs */
- std::map<IMemoryGroup *, std::map<void *, Element>> _finalized_groups; /**< A map that contains the finalized groups */
+ IMemoryGroup *_active_group; /**< Active group */
+ std::map<void *, Element> _active_elements; /**< A map that contains the active elements */
+ std::list<Blob> _free_blobs; /**< Free blobs */
+ std::list<Blob> _occupied_blobs; /**< Occupied blobs */
+ std::map<IMemoryGroup *, std::map<void *, Element>>
+ _finalized_groups; /**< A map that contains the finalized groups */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H */
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
index 17e581b40e..e2d3536169 100644
--- a/arm_compute/runtime/ITensorAllocator.h
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -101,9 +101,9 @@ protected:
virtual void unlock() = 0;
private:
- TensorInfo _info_owned{}; /**< Tensor's metadata. */
- TensorInfo *_info_external{ nullptr }; /**< External Tensor's metadata */
- size_t _alignment{}; /**< Tensor's alignment in bytes */
+ TensorInfo _info_owned{}; /**< Tensor's metadata. */
+ TensorInfo *_info_external{nullptr}; /**< External Tensor's metadata */
+ size_t _alignment{}; /**< Tensor's alignment in bytes */
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_ITENSORALLOCATOR_H */
diff --git a/arm_compute/runtime/ITransformWeights.h b/arm_compute/runtime/ITransformWeights.h
index f85b7966c5..08671bbe3c 100644
--- a/arm_compute/runtime/ITransformWeights.h
+++ b/arm_compute/runtime/ITransformWeights.h
@@ -72,7 +72,7 @@ public:
/** Allow instances of this class to be moved */
ITransformWeights &operator=(ITransformWeights &&other)
{
- if(this != &other)
+ if (this != &other)
{
_num_refcount = other._num_refcount.load();
_reshape_run = other._reshape_run;
@@ -119,9 +119,9 @@ public:
}
protected:
- std::atomic<int32_t> _num_refcount{ 0 };
- bool _reshape_run{ false };
+ std::atomic<int32_t> _num_refcount{0};
+ bool _reshape_run{false};
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */
diff --git a/arm_compute/runtime/IWeightsManager.h b/arm_compute/runtime/IWeightsManager.h
index 3b97d696bb..de8a92faa3 100644
--- a/arm_compute/runtime/IWeightsManager.h
+++ b/arm_compute/runtime/IWeightsManager.h
@@ -90,8 +90,8 @@ public:
private:
struct CounterElement
{
- bool is_unused{ false };
- std::atomic<int> counter{ 1 };
+ bool is_unused{false};
+ std::atomic<int> counter{1};
};
private:
@@ -99,5 +99,5 @@ private:
std::map<const ITensor *, CounterElement> _managed_counter;
std::map<const ITensor *, ITransformWeights *> _managed_weights_parents;
};
-} // arm_compute
-#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */
\ No newline at end of file
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */
diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h
index 1eab605d50..63514c409b 100644
--- a/arm_compute/runtime/Memory.h
+++ b/arm_compute/runtime/Memory.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_MEMORY_H
#include "arm_compute/runtime/IMemory.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -64,8 +63,8 @@ public:
// Inherited methods overridden:
IMemoryRegion *region() final;
IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
+ void set_region(IMemoryRegion *region) final;
+ void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
private:
IMemoryRegion *_region;
diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h
index 9fd2b9fa72..93ea3d2c72 100644
--- a/arm_compute/runtime/MemoryGroup.h
+++ b/arm_compute/runtime/MemoryGroup.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_MEMORYGROUP_H
#define ARM_COMPUTE_MEMORYGROUP_H
-#include "arm_compute/runtime/IMemoryGroup.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IMemoryPool.h"
@@ -59,8 +58,8 @@ public:
// Inherited methods overridden:
void manage(IMemoryManageable *obj) override;
void finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment) override;
- void acquire() override;
- void release() override;
+ void acquire() override;
+ void release() override;
MemoryMappings &mappings() override;
private:
@@ -70,15 +69,13 @@ private:
};
inline MemoryGroup::MemoryGroup(std::shared_ptr<IMemoryManager> memory_manager) noexcept
- : _memory_manager(memory_manager),
- _pool(nullptr),
- _mappings()
+ : _memory_manager(memory_manager), _pool(nullptr), _mappings()
{
}
inline void MemoryGroup::manage(IMemoryManageable *obj)
{
- if(_memory_manager && (obj != nullptr))
+ if (_memory_manager && (obj != nullptr))
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager());
@@ -95,7 +92,7 @@ inline void MemoryGroup::manage(IMemoryManageable *obj)
inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment)
{
- if(_memory_manager)
+ if (_memory_manager)
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager());
_memory_manager->lifetime_manager()->end_lifetime(obj, obj_memory, size, alignment);
@@ -104,7 +101,7 @@ inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_me
inline void MemoryGroup::acquire()
{
- if(!_mappings.empty())
+ if (!_mappings.empty())
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
_pool = _memory_manager->pool_manager()->lock_pool();
@@ -114,7 +111,7 @@ inline void MemoryGroup::acquire()
inline void MemoryGroup::release()
{
- if(_pool != nullptr)
+ if (_pool != nullptr)
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
ARM_COMPUTE_ERROR_ON(_mappings.empty());
@@ -128,5 +125,5 @@ inline MemoryMappings &MemoryGroup::mappings()
{
return _mappings;
}
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_MEMORYGROUP_H */
diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h
index 50547ac38e..7c31fe7f5a 100644
--- a/arm_compute/runtime/MemoryManagerOnDemand.h
+++ b/arm_compute/runtime/MemoryManagerOnDemand.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
#define ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
-#include "arm_compute/runtime/IMemoryManager.h"
-
#include "arm_compute/runtime/ILifetimeManager.h"
#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IPoolManager.h"
#include <memory>
@@ -39,7 +38,8 @@ class MemoryManagerOnDemand : public IMemoryManager
{
public:
/** Default Constructor */
- MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager);
+ MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager,
+ std::shared_ptr<IPoolManager> pool_manager);
/** Prevent instances of this class to be copy constructed */
MemoryManagerOnDemand(const MemoryManagerOnDemand &) = delete;
/** Prevent instances of this class to be copied */
@@ -52,12 +52,12 @@ public:
// Inherited methods overridden:
ILifetimeManager *lifetime_manager() override;
IPoolManager *pool_manager() override;
- void populate(IAllocator &allocator, size_t num_pools) override;
- void clear() override;
+ void populate(IAllocator &allocator, size_t num_pools) override;
+ void clear() override;
private:
std::shared_ptr<ILifetimeManager> _lifetime_mgr; /**< Lifetime manager */
std::shared_ptr<IPoolManager> _pool_mgr; /**< Memory pool manager */
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H */
diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h
index 6408deceaa..f8a4898281 100644
--- a/arm_compute/runtime/MemoryRegion.h
+++ b/arm_compute/runtime/MemoryRegion.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
#define ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
-#include "arm_compute/runtime/IMemoryRegion.h"
-
#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -41,21 +40,17 @@ public:
* @param[in] size Region size
* @param[in] alignment Alignment in bytes of the base pointer. Defaults to 0
*/
- MemoryRegion(size_t size, size_t alignment = 0)
- : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+ MemoryRegion(size_t size, size_t alignment = 0) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
// Allocate backing memory
size_t space = size + alignment;
- _mem = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr)
- {
- delete[] ptr;
- });
- _ptr = _mem.get();
+ _mem = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr) { delete[] ptr; });
+ _ptr = _mem.get();
// Calculate alignment offset
- if(alignment != 0)
+ if (alignment != 0)
{
void *aligned_ptr = _mem.get();
std::align(alignment, size, aligned_ptr, space);
@@ -63,10 +58,9 @@ public:
}
}
}
- MemoryRegion(void *ptr, size_t size)
- : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+ MemoryRegion(void *ptr, size_t size) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
_ptr = ptr;
}
@@ -91,7 +85,7 @@ public:
}
std::unique_ptr<IMemoryRegion> extract_subregion(size_t offset, size_t size) final
{
- if(_ptr != nullptr && (offset < _size) && (_size - offset >= size))
+ if (_ptr != nullptr && (offset < _size) && (_size - offset >= size))
{
return std::make_unique<MemoryRegion>(static_cast<uint8_t *>(_ptr) + offset, size);
}
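The MemoryRegion constructor above over-allocates by the requested alignment and then lets std::align pick the first suitably aligned address inside the buffer. A self-contained sketch of the same idiom outside the class:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>

int main()
{
    const std::size_t size      = 256;
    const std::size_t alignment = 64;

    // Over-allocate so that an aligned sub-range of `size` bytes is guaranteed to fit.
    std::size_t                     space = size + alignment;
    std::unique_ptr<std::uint8_t[]> mem(new std::uint8_t[space]());

    void *aligned_ptr = mem.get();
    // std::align advances aligned_ptr to the next alignment-byte boundary and
    // shrinks space accordingly; with the over-allocation above it cannot fail.
    std::align(alignment, size, aligned_ptr, space);

    std::cout << "offset into buffer: " << static_cast<std::uint8_t *>(aligned_ptr) - mem.get() << std::endl;
    return 0;
}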
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h
index 5637d831a3..7971168d24 100644
--- a/arm_compute/runtime/NEON/INEOperator.h
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_INEOPERATOR_H
#define ARM_COMPUTE_INEOPERATOR_H
-#include "../../core/ITensor.h"
#include "arm_compute/runtime/IOperator.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include "arm_compute/runtime/Types.h"
+#include "../../core/ITensor.h"
#include <memory>
namespace arm_compute
@@ -60,8 +60,8 @@ public:
~INEOperator();
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
MemoryRequirements workspace() const override;
protected:
diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h
index 7512759bd0..f783a836ee 100644
--- a/arm_compute/runtime/NEON/INESimpleFunction.h
+++ b/arm_compute/runtime/NEON/INESimpleFunction.h
@@ -57,5 +57,5 @@ protected:
std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */
std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_INESIMPLEFUNCTION_H */
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index a3082d00f6..613f44cc52 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -30,5 +30,5 @@ namespace arm_compute
{
/** CPU Scheduler */
using NEScheduler = Scheduler;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_NESCHEDULER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index 9992de2af8..5584fdc783 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H
#define ARM_COMPUTE_NEACTIVATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include <memory>
@@ -102,5 +101,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-} // namespace arm_computes
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
index e5e85542f8..6c65c055dd 100644
--- a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
+++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
@@ -81,19 +81,28 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add,
- ITensor *add_output, ITensor *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *bn_mul,
+ ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd
*
* Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor *
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index b0d710d517..73a43dbc44 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -74,7 +75,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
*
* @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
@@ -85,7 +90,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
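For the reflowed configure() overload above, a minimal end-to-end usage sketch; the Tensor/allocator set-up follows the library's usual runtime pattern and is assumed rather than shown in this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
    using namespace arm_compute;

    Tensor           input1, input2, output;
    const TensorInfo info(TensorShape(128U, 64U), 1, DataType::F32);
    input1.allocator()->init(info);
    input2.allocator()->init(info);
    output.allocator()->init(info);

    NEArithmeticAddition add;
    // Saturating overflow policy; the defaulted ActivationLayerInfo means no fused activation.
    add.configure(&input1, &input2, &output, ConvertPolicy::SATURATE);

    // Backing memory is allocated after configure() and before run().
    input1.allocator()->allocate();
    input2.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input1 and input2 ...
    add.run();
    return 0;
}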
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index 6fbe9ad450..3e4f6356c5 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -80,7 +80,11 @@ public:
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
*
* @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
@@ -91,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index ec00fbdbf2..99e2dcadbb 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -81,7 +81,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer
*
@@ -98,10 +104,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -109,5 +119,5 @@ public:
private:
std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
index b33ba435a8..ebed0bea29 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H
#define ARM_COMPUTE_NEBATCHTOSPACELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
@@ -77,7 +76,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{});
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -99,7 +102,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{});
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index 2a196a2de5..aa41fc0df2 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -57,7 +57,8 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform
*
@@ -71,7 +72,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */
diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h
index 821249c142..43cae777f6 100644
--- a/arm_compute/runtime/NEON/functions/NECast.h
+++ b/arm_compute/runtime/NEON/functions/NECast.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECAST_H
#define ARM_COMPUTE_NECAST_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index dd1c709d76..1600f85488 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECONCATENATELAYER_H
#define ARM_COMPUTE_NECONCATENATELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -87,7 +86,8 @@ public:
*
* @return a status
*/
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status
+ validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h
index 2a3c5351b0..525f37f3e7 100644
--- a/arm_compute/runtime/NEON/functions/NEConv3D.h
+++ b/arm_compute/runtime/NEON/functions/NEConv3D.h
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_NECONV3D_H
#define ARM_COMPUTE_NECONV3D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -76,14 +75,19 @@ public:
* @param[out] output Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs.
* @param[in] conv_info Contains padding, stride, acitvation information described in @ref Conv3dInfo.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info);
+ void configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to NEConv3D::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv3dInfo &conv_info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index a892d3036b..dc6b22d717 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
#define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -66,7 +65,8 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void
+ configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights
*
* @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -76,7 +76,10 @@ public:
*
* @return A Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
// Inherited methods overriden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index 4dd76d082b..cdf0f652e1 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H
#define ARM_COMPUTE_NECONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
@@ -119,8 +118,16 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -143,9 +150,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -165,8 +179,14 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h
index ee02c259f4..840c03e968 100644
--- a/arm_compute/runtime/NEON/functions/NECopy.h
+++ b/arm_compute/runtime/NEON/functions/NECopy.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NECOPY_H
#define ARM_COMPUTE_NECOPY_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h
index 143bbbc6f1..f806762158 100644
--- a/arm_compute/runtime/NEON/functions/NECropResize.h
+++ b/arm_compute/runtime/NEON/functions/NECropResize.h
@@ -75,8 +75,13 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NESlice
*
@@ -96,8 +101,13 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value);
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index cdc3a636b0..aabe42f928 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -24,15 +24,14 @@
#ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
-#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -117,7 +116,13 @@ public:
* the GEMM convolution.
*
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math = false, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -134,8 +139,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
- bool enable_fast_math = false, const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index eb0724ae12..7bfdfbd13d 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEDEPTHCONVERT_H
#define ARM_COMPUTE_NEDEPTHCONVERT_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -84,7 +83,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
// Inherited methods overridden
void run() override;
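
A small sketch of the two-line validate() form above, assuming an F16-to-F32 conversion; the shape and data types are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"

using namespace arm_compute;

Status check_depth_convert()
{
    const TensorInfo src(TensorShape(16U, 16U), 1, DataType::F16);
    const TensorInfo dst(TensorShape(16U, 16U), 1, DataType::F32);

    // Saturating conversion; no bit shift is applied here.
    return NEDepthConvertLayer::validate(&src, &dst, ConvertPolicy::SATURATE, 0);
}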
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index b9bdcd1f11..c7df29a704 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 6f2ec8cddb..6ad5aa7bfa 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -28,6 +28,7 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
#include <memory>
namespace arm_compute
@@ -80,8 +81,14 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer
*
@@ -98,8 +105,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:

void run() override;
@@ -127,9 +140,11 @@ private:
/** Default move constructor */
NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
+ NEDepthwiseConvolutionLayerOptimizedInternal &
+ operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
/** Default move assignment operator */
- NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
+ NEDepthwiseConvolutionLayerOptimizedInternal &
+ operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
/** Default destructor */
~NEDepthwiseConvolutionLayerOptimizedInternal() = default;
/** Initialize the function's source, destination, kernels and border_size.
@@ -144,8 +159,14 @@ private:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3
*
@@ -161,8 +182,14 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
@@ -207,8 +234,14 @@ private:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric
*
@@ -225,8 +258,14 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
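
A hedged validate() sketch for the depthwise declarations above; the 3D weight layout [kernel_w, kernel_h, IFM * depth_multiplier] and all sizes are illustrative assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"

using namespace arm_compute;

Status check_depthwise_conv()
{
    const TensorInfo    src(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    const TensorInfo    weights(TensorShape(3U, 3U, 8U), 1, DataType::F32); // depth_multiplier == 1
    const TensorInfo    dst(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 1, 1);

    // Fuse a ReLU and keep the default dilation of (1, 1).
    return NEDepthwiseConvolutionLayer::validate(&src, &weights, nullptr /* no bias */, &dst, conv_info,
                                                 1 /* depth_multiplier */,
                                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
}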
diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
index 2affa8d49e..7a94833d10 100644
--- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
@@ -24,13 +24,12 @@
#ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
#define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
#include "arm_compute/runtime/Tensor.h"
#include <map>
@@ -78,8 +77,14 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+ void configure(const ITensor *input_box_encoding,
+ const ITensor *input_score,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer
*
* @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
@@ -93,8 +98,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+ static Status validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 8db7e6596b..3ae3b2a15c 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -85,7 +85,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer
*
* @note: DirectConvolution only works in the following configurations:
@@ -106,7 +111,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
index bfcd221e17..ebf2277d1f 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
@@ -73,7 +73,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -83,7 +86,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -133,7 +139,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -143,7 +152,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -193,7 +205,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -203,7 +218,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -249,7 +267,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -259,7 +280,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -306,7 +330,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -316,7 +343,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -377,7 +407,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
// Inherited methods overridden:
void run() override;
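
As a usage note for the element-wise functions reformatted above, a minimal sketch with the max variant declared in this header (NEElementwiseMax); the tensor shape and fill step are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, out;
    const TensorInfo info(TensorShape(16U, 4U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NEElementwiseMax max_op;
    max_op.configure(&a, &b, &out); // act_info is left at its default (no fused activation)

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    // ... fill a and b ...
    max_op.run();
    return 0;
}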
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index 9654b1e604..99c6fd4eb4 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFT1D_H
#define ARM_COMPUTE_NEFFT1D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index 57f38d1942..cefd3df17a 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFT2D_H
#define ARM_COMPUTE_NEFFT2D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
#include "arm_compute/runtime/Tensor.h"
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index c5f4d45b6b..84bfe6b02f 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
@@ -94,8 +93,13 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer
*
* @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout
@@ -113,8 +117,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index e923ce33e1..1829c71fef 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_NEFILL_H
#define ARM_COMPUTE_NEFILL_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index ab77c28839..44b1d4a62b 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -57,7 +58,10 @@ public:
* @param[in] border_mode Strategy to use for borders.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *input,
+ unsigned int border_width,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index 4d47b068db..77ac484bab 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFLOOR_H
#define ARM_COMPUTE_NEFLOOR_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 05b7ce3735..885f8430cf 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -28,7 +28,6 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
-
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/Tensor.h"
@@ -88,7 +87,8 @@ class NEFullyConnectedLayer : public IFunction
{
public:
/** Constructor */
- NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
/** Prevent instances of this class from being moved (As this class contains pointers) */
@@ -126,16 +126,24 @@ public:
* @param[in] fc_info (Optional) Fully connected layer additional info
* @param[in] weights_info (Optional) Stores necessary compute information when weights are already reshaped
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer
*
* Similar to @ref NEFullyConnectedLayer::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function that queries whether fixed-format kernel exists for a given problem description
*
@@ -149,8 +157,13 @@ public:
*
* @return a status
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info, const WeightsInfo &weights_info);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const FullyConnectedLayerInfo &fc_info,
+ const WeightsInfo &weights_info);
// Inherited methods overridden:
void run() override;
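
A validate() sketch for the fully connected declarations above, using the default FullyConnectedLayerInfo and WeightsInfo; the 128-in / 64-out shapes are illustrative and the returned Status reports whether the library accepts them:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"

using namespace arm_compute;

Status check_fully_connected()
{
    const TensorInfo src(TensorShape(128U), 1, DataType::F32); // one flattened input vector
    const TensorInfo weights(TensorShape(128U, 64U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(64U), 1, DataType::F32);

    return NEFullyConnectedLayer::validate(&src, &weights, nullptr /* no bias */, &dst);
}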
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
index 3dd7f49044..f53b3de7f6 100644
--- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
+++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
@@ -75,9 +75,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -95,10 +102,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index c6ff2dfb92..29650a5eca 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -78,14 +78,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should happen only for the first run
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensor *a,
+ const ITensor *b,
+ const ITensor *c,
+ ITensor *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMM.
*
* Similar to @ref NEGEMM::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format
* weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same
@@ -93,8 +105,14 @@ public:
*
* @return a status
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
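
A caller-side sketch of the reformatted NEGEMM::configure() above, computing D = alpha * A * B with no C term; sizes follow the library's [columns, rows] shape convention and are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    constexpr unsigned int M = 4, N = 8, K = 12;

    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32)); // A is M x K
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32)); // B is K x N
    d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32)); // D is M x N

    NEGEMM gemm;
    gemm.configure(&a, &b, nullptr /* no C */, &d, 1.0f /* alpha */, 0.0f /* beta */);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a and b ...
    gemm.run();
    return 0;
}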
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index 53ceb6d978..d1c5a1c9b3 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -86,7 +86,8 @@ public:
* Data types supported: Same as @p input.
* @param[in] info Convolution layer descriptor
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+ void
+ configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -102,7 +103,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 72309e464e..3e84c3e2cf 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
@@ -49,7 +47,8 @@ class NEGEMMConvolutionLayer : public IFunction
{
public:
/** Constructor */
- NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
/** Prevent instances of this class from being moved (As this class contains non movable objects) */
@@ -95,8 +94,16 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -119,9 +126,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if there is an optimized version of
* GEMM available for the input parameters.
@@ -178,10 +192,16 @@ public:
*
* @return a Status
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
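
A validate() sketch reaching the trailing enable_fast_math flag of the declaration above; the intermediate defaults are spelled out explicitly and all shapes are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"

using namespace arm_compute;

Status check_gemm_conv_fast_math()
{
    const TensorInfo    src(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    const TensorInfo    weights(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32);
    const TensorInfo    dst(TensorShape(32U, 32U, 16U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 1, 1);

    // Defaults are passed explicitly only to reach the enable_fast_math argument.
    return NEGEMMConvolutionLayer::validate(&src, &weights, nullptr /* no bias */, &dst, conv_info,
                                            WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(),
                                            true /* enable_fast_math */);
}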
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index addb13cdfa..824c4443ad 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -47,7 +47,8 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
/** Constructor */
- NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
/** Default move constructor */
@@ -96,14 +97,19 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
*
* Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index 232344e5c2..0d932bb4af 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -89,7 +89,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 3b683382ec..0f294fde22 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -95,7 +95,12 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+ void configure(const ITensor *scores,
+ const ITensor *deltas,
+ const ITensor *anchors,
+ ITensor *proposals,
+ ITensor *scores_out,
+ ITensor *num_valid_proposals,
const GenerateProposalsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer
@@ -112,7 +117,11 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ static Status validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals,
const GenerateProposalsInfo &info);
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
index bb0697072b..0bc57be09e 100644
--- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -89,7 +89,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f);
// Inherited methods overridden:
void run() override;
@@ -103,5 +107,5 @@ private:
Tensor _permuted_input;
Tensor _permuted_output;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */
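
A one-call sketch for the instance normalization validate() above, keeping the documented gamma/beta/epsilon defaults; the shape is illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"

using namespace arm_compute;

Status check_instance_norm()
{
    const TensorInfo src(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 16U, 8U), 1, DataType::F32);

    // gamma = 1.0f, beta = 0.0f and epsilon = 1e-12f are taken from the defaults.
    return NEInstanceNormalizationLayer::validate(&src, &dst);
}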
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index 7f1a5e785e..8502cee5d2 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -97,5 +97,5 @@ private:
std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel;
Tensor _sumsq;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index 4272215486..629c5d10a0 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NELSTMLAYER_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
@@ -35,7 +36,6 @@
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
{
@@ -104,13 +104,26 @@ public:
* @param[in] projection_threshold The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0 then clipping is disabled.
*/
- void configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *output_state_in, const ITensor *cell_state_in,
- ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
- const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *output_state_in,
+ const ITensor *cell_state_in,
+ ITensor *scratch_buffer,
+ ITensor *output_state_out,
+ ITensor *cell_state_out,
+ ITensor *output,
+ const LSTMParams<ITensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
*
@@ -151,13 +164,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index bcb89d997d..ae951669b3 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -38,8 +39,6 @@
#include "arm_compute/runtime/NEON/functions/NESlice.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
-
namespace arm_compute
{
// Forward declarations
@@ -104,11 +103,22 @@ public:
* @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input.
*/
void configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out);
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
*
@@ -133,11 +143,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
index e961f860c1..414fc2f3fd 100644
--- a/arm_compute/runtime/NEON/functions/NEMatMul.h
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -48,7 +49,7 @@ public:
};
private:
- bool _fast_math{ false };
+ bool _fast_math{false};
};
// Forward declarations
@@ -96,7 +97,12 @@ public:
* @param[in] settings Contains flags for function level settings, i.e. fast math
* @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
*/
- void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *lhs,
+ ITensor *rhs,
+ ITensor *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEMatMul
*
* @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
@@ -108,7 +114,11 @@ public:
*
* @return Status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden
@@ -118,5 +128,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL */
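
A validate() sketch for the NEMatMul declarations above, with default-constructed MatMulInfo and CpuMatMulSettings (so no operand transposition and fast math disabled); the shapes and the MatMulInfo include path are assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"

using namespace arm_compute;

Status check_matmul()
{
    // 2D operands in the library's [columns, rows] convention: (4 x 12) * (12 x 8) -> (4 x 8).
    const TensorInfo lhs(TensorShape(12U, 4U), 1, DataType::F32);
    const TensorInfo rhs(TensorShape(8U, 12U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(8U, 4U), 1, DataType::F32);

    const MatMulInfo        info;
    const CpuMatMulSettings settings;

    return NEMatMul::validate(&lhs, &rhs, &dst, info, settings);
}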
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
index 2f77540e1e..e00fc4544f 100644
--- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -86,7 +87,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
@@ -96,5 +100,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index fbe000445c..27e3fa674e 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H
#define ARM_COMPUTE_NENORMALIZATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
@@ -88,16 +87,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group; /**< Function memory group */
- std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */
- NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */
- Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+ MemoryGroup _memory_group; /**< Function memory group */
+ std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index 4aa6725496..494b1c0641 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -24,14 +24,14 @@
#ifndef ARM_COMPUTE_NEPADLAYER_H
#define ARM_COMPUTE_NEPADLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include "arm_compute/runtime/SubTensor.h"
-
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
+
#include <memory>
namespace arm_compute
@@ -82,7 +82,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -95,7 +99,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run() override;
@@ -109,7 +117,10 @@ private:
* specifies the front and the end padding in the i-th dimension.
* @param[in] constant_value Constant value to be used for the padding
*/
- void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value);
+ void configure_constant_mode(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value);
/** Configure functions for when reflect or symmetric padding is used.
*
* @param[in] input Source tensor. Data types supported: All.
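To make the reflowed NEPadLayer signatures above concrete, here is a minimal, hedged usage sketch. The tensors src and padded are illustrative placeholders, and the example assumes PaddingList is the usual per-dimension list of (before, after) element pairs.

// Hedged sketch of NEPadLayer::configure() as declared above.
// Pads dimension 0 by 1 element on each side and dimension 1 by 2 elements on each side,
// filling with the default constant value (CONSTANT mode).
PaddingList padding = {{1, 1}, {2, 2}};

NEPadLayer pad;
pad.configure(&src, &padded, padding); // constant_value and mode take their defaults
// PaddingMode::REFLECT or PaddingMode::SYMMETRIC could be passed instead of the default
// CONSTANT mode to mirror the input at the borders, as the doc comment above describes.
pad.run();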
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index c863fde0ac..2cef64764d 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEPERMUTE_H
#define ARM_COMPUTE_NEPERMUTE_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 634e8e0c39..3d81bf6087 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -95,7 +95,12 @@ public:
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
*
@@ -122,7 +127,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -158,7 +168,10 @@ public:
* @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -166,7 +179,10 @@ public:
* @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
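A short, hedged usage sketch of the NEPixelWiseMultiplication interface reflowed above; input1, input2 and out are illustrative, already-allocated F32 tensors.

// Hedged sketch of NEPixelWiseMultiplication::configure() as declared above:
// out = input1 * input2 with unit scale, saturating on overflow and rounding towards zero.
NEPixelWiseMultiplication mul;
mul.configure(&input1, &input2, &out, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
mul.run();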
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
index 4c5eb58e05..09251f2a5f 100644
--- a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -92,5 +92,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 9147ad9687..768ad0d818 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -91,7 +91,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
void run() override;
@@ -100,5 +103,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
index 38e0c9f3ad..858e3299af 100644
--- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
@@ -62,7 +62,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index 185d821ec0..009a4e0911 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEQLSTMLAYER_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
@@ -35,7 +36,6 @@
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
#include <memory>
@@ -130,12 +130,21 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *cell_state_in, ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+ void configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *cell_state_in,
+ ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out,
+ ITensor *output,
const LSTMParams<ITensor> &lstm_params);
/** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer
@@ -180,12 +189,21 @@ public:
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params);
// Inherited methods overridden:
@@ -218,10 +236,17 @@ private:
* @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor.
*
*/
- void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res,
- Tensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+ void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+ NEGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensor *mm_input,
+ const ITensor *mm_weights,
+ const ITensor *bias,
+ Tensor *mm_res,
+ Tensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info);
MemoryGroup _memory_group;
@@ -230,8 +255,8 @@ private:
{
static constexpr uint32_t max_dimension_supported = 2;
- ITensor *_src{ nullptr };
- ITensor *_dst{ nullptr };
+ ITensor *_src{nullptr};
+ ITensor *_dst{nullptr};
size_t _row_size{};
Window _window{};
@@ -335,19 +360,16 @@ private:
NECopy _copy_output;
// Tensor pointers
- const ITensor *_input_to_input_weights
- {
- nullptr
- };
- const ITensor *_recurrent_to_input_weights{ nullptr };
- const ITensor *_projection_bias{ nullptr };
- const ITensor *_input_to_forget_weights{ nullptr };
- const ITensor *_input_to_cell_weights{ nullptr };
- const ITensor *_input_to_output_weights{ nullptr };
- const ITensor *_recurrent_to_forget_weights{ nullptr };
- const ITensor *_recurrent_to_cell_weights{ nullptr };
- const ITensor *_recurrent_to_output_weights{ nullptr };
- const ITensor *_projection_weights{ nullptr };
+ const ITensor *_input_to_input_weights{nullptr};
+ const ITensor *_recurrent_to_input_weights{nullptr};
+ const ITensor *_projection_bias{nullptr};
+ const ITensor *_input_to_forget_weights{nullptr};
+ const ITensor *_input_to_cell_weights{nullptr};
+ const ITensor *_input_to_output_weights{nullptr};
+ const ITensor *_recurrent_to_forget_weights{nullptr};
+ const ITensor *_recurrent_to_cell_weights{nullptr};
+ const ITensor *_recurrent_to_output_weights{nullptr};
+ const ITensor *_projection_weights{nullptr};
std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};
@@ -382,66 +404,66 @@ private:
return _layer_norms[getGateIndex(g)];
}
- void configure_layer_norm(LayerNormGate g, const ITensor *in);
+ void configure_layer_norm(LayerNormGate g, const ITensor *in);
static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
- Tensor _input_to_forget_weights_f32{ nullptr };
- Tensor _input_to_forget_weights_symm8{ nullptr };
+ Tensor _input_to_forget_weights_f32{nullptr};
+ Tensor _input_to_forget_weights_symm8{nullptr};
- Tensor _input_to_forget_weights_transposed{ nullptr };
- Tensor _input_to_cell_weights_transposed{ nullptr };
- Tensor _input_to_output_weights_transposed{ nullptr };
- Tensor _input_to_input_weights_transposed{ nullptr };
- Tensor _recurrent_to_forget_weights_transposed{ nullptr };
- Tensor _recurrent_to_cell_weights_transposed{ nullptr };
- Tensor _recurrent_to_output_weights_transposed{ nullptr };
- Tensor _recurrent_to_input_weights_transposed{ nullptr };
- Tensor _projection_weights_transposed{ nullptr };
- Tensor _input_to_input_eff_bias{ nullptr };
- Tensor _recurrent_to_input_eff_bias{ nullptr };
- Tensor _input_to_forget_eff_bias{ nullptr };
- Tensor _recurrent_to_forget_eff_bias{ nullptr };
- Tensor _input_to_cell_eff_bias{ nullptr };
- Tensor _recurrent_to_cell_eff_bias{ nullptr };
- Tensor _input_to_output_eff_bias{ nullptr };
- Tensor _recurrent_to_output_eff_bias{ nullptr };
- Tensor _projection_reduction_res{ nullptr };
- Tensor _projection_eff_bias{ nullptr };
- Tensor _mm_input_to_forget_res{ nullptr };
- Tensor _mm_recurrent_to_forget_res{ nullptr };
- Tensor _mul_cell_to_forget_res{ nullptr };
- Tensor _input_to_forget_outstage_res{ nullptr };
- Tensor _cell_to_forget_outstage_res{ nullptr };
- Tensor _recurrent_to_forget_outstage_res{ nullptr };
- Tensor _forget_gate{ nullptr };
- Tensor _mm_input_to_cell_res{ nullptr };
- Tensor _input_to_cell_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_cell_res{ nullptr };
- Tensor _recurrent_to_cell_outstage_res{ nullptr };
- Tensor _cell_gate{ nullptr };
- Tensor _mul_input_cell_res{ nullptr };
- Tensor _mm_input_to_input_res{ nullptr };
- Tensor _input_to_input_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_input_res{ nullptr };
- Tensor _mul_cell_to_input_res{ nullptr };
- Tensor _cell_to_input_outstage_res{ nullptr };
- Tensor _recurrent_to_input_outstage_res{ nullptr };
- Tensor _input_gate{ nullptr };
- Tensor _mm_input_to_output_res{ nullptr };
- Tensor _input_to_output_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_output_res{ nullptr };
- Tensor _mul_cell_to_output_res{ nullptr };
- Tensor _cell_to_output_outstage_res{ nullptr };
- Tensor _recurrent_to_output_outstage_res{ nullptr };
- Tensor _output_gate{ nullptr };
- Tensor _hidden_mul_res{ nullptr };
- Tensor _hidden_gate{ nullptr };
- Tensor _mm_projection_res{ nullptr };
- Tensor _projection_outstage_res{ nullptr };
- Tensor _projection_out_res{ nullptr };
- Tensor _projection_accumulate_res{ nullptr };
- Tensor _ones{ nullptr };
+ Tensor _input_to_forget_weights_transposed{nullptr};
+ Tensor _input_to_cell_weights_transposed{nullptr};
+ Tensor _input_to_output_weights_transposed{nullptr};
+ Tensor _input_to_input_weights_transposed{nullptr};
+ Tensor _recurrent_to_forget_weights_transposed{nullptr};
+ Tensor _recurrent_to_cell_weights_transposed{nullptr};
+ Tensor _recurrent_to_output_weights_transposed{nullptr};
+ Tensor _recurrent_to_input_weights_transposed{nullptr};
+ Tensor _projection_weights_transposed{nullptr};
+ Tensor _input_to_input_eff_bias{nullptr};
+ Tensor _recurrent_to_input_eff_bias{nullptr};
+ Tensor _input_to_forget_eff_bias{nullptr};
+ Tensor _recurrent_to_forget_eff_bias{nullptr};
+ Tensor _input_to_cell_eff_bias{nullptr};
+ Tensor _recurrent_to_cell_eff_bias{nullptr};
+ Tensor _input_to_output_eff_bias{nullptr};
+ Tensor _recurrent_to_output_eff_bias{nullptr};
+ Tensor _projection_reduction_res{nullptr};
+ Tensor _projection_eff_bias{nullptr};
+ Tensor _mm_input_to_forget_res{nullptr};
+ Tensor _mm_recurrent_to_forget_res{nullptr};
+ Tensor _mul_cell_to_forget_res{nullptr};
+ Tensor _input_to_forget_outstage_res{nullptr};
+ Tensor _cell_to_forget_outstage_res{nullptr};
+ Tensor _recurrent_to_forget_outstage_res{nullptr};
+ Tensor _forget_gate{nullptr};
+ Tensor _mm_input_to_cell_res{nullptr};
+ Tensor _input_to_cell_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_cell_res{nullptr};
+ Tensor _recurrent_to_cell_outstage_res{nullptr};
+ Tensor _cell_gate{nullptr};
+ Tensor _mul_input_cell_res{nullptr};
+ Tensor _mm_input_to_input_res{nullptr};
+ Tensor _input_to_input_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_input_res{nullptr};
+ Tensor _mul_cell_to_input_res{nullptr};
+ Tensor _cell_to_input_outstage_res{nullptr};
+ Tensor _recurrent_to_input_outstage_res{nullptr};
+ Tensor _input_gate{nullptr};
+ Tensor _mm_input_to_output_res{nullptr};
+ Tensor _input_to_output_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_output_res{nullptr};
+ Tensor _mul_cell_to_output_res{nullptr};
+ Tensor _cell_to_output_outstage_res{nullptr};
+ Tensor _recurrent_to_output_outstage_res{nullptr};
+ Tensor _output_gate{nullptr};
+ Tensor _hidden_mul_res{nullptr};
+ Tensor _hidden_gate{nullptr};
+ Tensor _mm_projection_res{nullptr};
+ Tensor _projection_outstage_res{nullptr};
+ Tensor _projection_out_res{nullptr};
+ Tensor _projection_accumulate_res{nullptr};
+ Tensor _ones{nullptr};
std::array<Tensor, _layer_norm_count> _layer_norm_output{};
inline Tensor &get_layer_norm_output(LayerNormGate g)
@@ -449,15 +471,15 @@ private:
return _layer_norm_output[getGateIndex(g)];
}
- bool _is_prepared{ false };
- bool _has_cifg{ false };
- bool _has_cell_clipping{ false };
- bool _has_projection{ false };
- bool _has_projection_clipping{ false };
- bool _has_peephole{ false };
- bool _has_layer_norm{ false };
- bool _projection_tensor_copy_required{ false };
- bool _convert_input_to_forget_weights_to_qsymm8{ false };
+ bool _is_prepared{false};
+ bool _has_cifg{false};
+ bool _has_cell_clipping{false};
+ bool _has_projection{false};
+ bool _has_projection_clipping{false};
+ bool _has_peephole{false};
+ bool _has_layer_norm{false};
+ bool _projection_tensor_copy_required{false};
+ bool _convert_input_to_forget_weights_to_qsymm8{false};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEQLSTMLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index 667d3144ac..af7f464ac9 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -72,7 +72,13 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *recurrent_weights,
+ const ITensor *bias,
+ ITensor *hidden_state,
+ ITensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -85,7 +91,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
const ActivationLayerInfo &info);
// Inherited methods overridden:
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index ea1af4daea..b06ebe899d 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -77,7 +77,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 2992b3eb95..929111ad4b 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/IArray.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
@@ -73,7 +74,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
@@ -91,7 +93,10 @@ public:
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h
index cb14c8fdde..609456a4ef 100644
--- a/arm_compute/runtime/NEON/functions/NERange.h
+++ b/arm_compute/runtime/NEON/functions/NERange.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+
#include <memory>
namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index caaee8284a..5b8d8cdf2b 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H
#define ARM_COMPUTE_NEON_REDUCE_MEAN_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
@@ -81,7 +80,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+ static Status
+ validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 533c10adcf..f5391a6d0e 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -25,9 +25,9 @@
#define ARM_COMPUTE_NEREDUCTIONOPERATION_H
#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include <memory>
namespace arm_compute
@@ -88,7 +88,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
index eb777f1925..e3fa7b9c16 100644
--- a/arm_compute/runtime/NEON/functions/NEReorderLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
@@ -66,7 +66,10 @@ public:
* @param[in] input_wf WeightFormat of input.
* @param[in] output_wf WeightFormat of output.
*/
- void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
/** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer
*
@@ -74,7 +77,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
// Inherited methods overridden:
void run() override;
@@ -85,4 +91,4 @@ private:
} // namespace arm_compute
#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */
-#endif // defined(__aarch64__) \ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h
index f58eb2373f..e03e415068 100644
--- a/arm_compute/runtime/NEON/functions/NEReverse.h
+++ b/arm_compute/runtime/NEON/functions/NEReverse.h
@@ -24,9 +24,8 @@
#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
@@ -68,7 +67,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, const bool use_inverted_axis = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ const bool use_inverted_axis = false);
};
} // namespace arm_compute
#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 0920ff3802..72dfa3bda4 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_NESCALEIMAGE_H
#define ARM_COMPUTE_NESCALEIMAGE_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index ac79a5c633..70a688d3b0 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -85,7 +85,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
// Inherited methods overridden:
void run() override;
@@ -129,7 +130,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
};
} // namespace experimental
} // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
index ad8c1467d0..5dee61a4a8 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
@@ -24,9 +24,9 @@
#ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H
#define ARM_COMPUTE_NESPACETOBATCHLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/Types.h"
#include <memory>
namespace arm_compute
@@ -82,7 +82,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -92,7 +97,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings)
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -104,7 +112,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
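For reference, a hedged sketch of the static-shape NESpaceToBatchLayer overload reflowed above; src and dst are illustrative tensors.

// Hedged sketch of NESpaceToBatchLayer::configure() with a static 2x2 block shape
// and one element of padding at the end of each spatial dimension.
NESpaceToBatchLayer s2b;
s2b.configure(&src, /* block_shape_x */ 2, /* block_shape_y */ 2,
              Size2D(0, 0) /* padding_left */, Size2D(1, 1) /* padding_right */, &dst);
s2b.run();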
diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h
index 206f299c06..36358a7094 100644
--- a/arm_compute/runtime/NEON/functions/NESplit.h
+++ b/arm_compute/runtime/NEON/functions/NESplit.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NESlice.h"
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index 4b14d946f6..fa1113ffec 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -71,9 +71,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ITensor *input,
+ ITensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
*
@@ -89,9 +94,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
// Inherited methods overridden:
void run() override;
@@ -121,9 +131,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
*
@@ -139,9 +154,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
};
} // namespace experimental
} // namespace arm_compute
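To illustrate the reflowed NEStridedSlice signatures above, a hedged sketch follows; src and dst are placeholder tensors, the example assumes a 4x4 2D source, and it takes BiStrides to be the usual Coordinates alias.

// Hedged sketch of NEStridedSlice::configure() as declared above:
// copy elements [1, 3) of dimension 0 and [0, 4) of dimension 1 with unit strides,
// leaving the mask arguments at their defaults.
NEStridedSlice slice;
slice.configure(&src, &dst, Coordinates(1, 0), Coordinates(3, 4), BiStrides(1, 1));
slice.run();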
diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h
index 915e5aa1da..001a0a4128 100644
--- a/arm_compute/runtime/NEON/functions/NETile.h
+++ b/arm_compute/runtime/NEON/functions/NETile.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NETILE_H
#define ARM_COMPUTE_NETILE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 581fe74309..5d2d1f1b01 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NETRANSPOSE_H
#define ARM_COMPUTE_NETRANSPOSE_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -83,4 +82,4 @@ private:
std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETRANSPOSE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_NETRANSPOSE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h
index 079fee5b9e..e1af96d08d 100644
--- a/arm_compute/runtime/NEON/functions/NEUnstack.h
+++ b/arm_compute/runtime/NEON/functions/NEUnstack.h
@@ -26,7 +26,6 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include <memory>
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index f6f0185e7d..7f4e354362 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -87,8 +86,13 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
@@ -100,8 +104,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
private:
struct Impl;
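A hedged sketch of the reflowed NEWinogradConvolutionLayer interface above; src, weights, biases and dst are illustrative tensors, and the unit-stride, padded 3x3 setup is merely one example of a Winograd-friendly configuration.

// Hedged sketch of NEWinogradConvolutionLayer::configure() as declared above:
// unit strides, one element of padding, fused ReLU, fast math left at its default (false).
NEWinogradConvolutionLayer conv;
conv.configure(&src, &weights, &biases, &dst,
               PadStrideInfo(1, 1, 1, 1),
               ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
conv.run();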
diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h
index 2eef61a236..13ebb9fbe3 100644
--- a/arm_compute/runtime/OffsetLifetimeManager.h
+++ b/arm_compute/runtime/OffsetLifetimeManager.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H
#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-
#include "arm_compute/runtime/Types.h"
#include <map>
@@ -62,7 +61,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- MappingType mapping_type() const override;
+ MappingType mapping_type() const override;
private:
// Inherited methods overridden:
diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h
index a5c363d866..7250194f85 100644
--- a/arm_compute/runtime/OffsetMemoryPool.h
+++ b/arm_compute/runtime/OffsetMemoryPool.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_OFFSETMEMORYPOOL_H
#include "arm_compute/runtime/IMemoryPool.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include "arm_compute/runtime/Types.h"
@@ -65,8 +64,8 @@ public:
const BlobInfo &info() const;
// Inherited methods overridden:
- void acquire(MemoryMappings &handles) override;
- void release(MemoryMappings &handles) override;
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
MappingType mapping_type() const override;
std::unique_ptr<IMemoryPool> duplicate() override;
diff --git a/arm_compute/runtime/OperatorTensor.h b/arm_compute/runtime/OperatorTensor.h
index 92ae01934b..237585bec2 100644
--- a/arm_compute/runtime/OperatorTensor.h
+++ b/arm_compute/runtime/OperatorTensor.h
@@ -26,8 +26,8 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Types.h"
#include "arm_compute/runtime/experimental/Types.h"
+#include "arm_compute/runtime/Types.h"
#include <cstdint>
diff --git a/arm_compute/runtime/PoolManager.h b/arm_compute/runtime/PoolManager.h
index cc50fc04a4..6aa6aef6e2 100644
--- a/arm_compute/runtime/PoolManager.h
+++ b/arm_compute/runtime/PoolManager.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_POOLMANAGER_H
#define ARM_COMPUTE_POOLMANAGER_H
-#include "arm_compute/runtime/IPoolManager.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/IPoolManager.h"
+
#include "support/Mutex.h"
#include "support/Semaphore.h"
@@ -53,9 +53,9 @@ public:
PoolManager &operator=(PoolManager &&) = delete;
// Inherited methods overridden:
- IMemoryPool *lock_pool() override;
- void unlock_pool(IMemoryPool *pool) override;
- void register_pool(std::unique_ptr<IMemoryPool> pool) override;
+ IMemoryPool *lock_pool() override;
+ void unlock_pool(IMemoryPool *pool) override;
+ void register_pool(std::unique_ptr<IMemoryPool> pool) override;
std::unique_ptr<IMemoryPool> release_pool() override;
void clear_pools() override;
size_t num_pools() const override;
@@ -66,5 +66,5 @@ private:
std::unique_ptr<arm_compute::Semaphore> _sem; /**< Semaphore to control the queues */
mutable arm_compute::Mutex _mtx; /**< Mutex to control access to the queues */
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_POOLMANAGER_H */
diff --git a/arm_compute/runtime/RuntimeContext.h b/arm_compute/runtime/RuntimeContext.h
index 23bd267375..d64e609196 100644
--- a/arm_compute/runtime/RuntimeContext.h
+++ b/arm_compute/runtime/RuntimeContext.h
@@ -54,8 +54,8 @@ public:
IAssetManager *asset_manager() override;
private:
- std::unique_ptr<IScheduler> _owned_scheduler{ nullptr };
- IScheduler *_scheduler{ nullptr };
+ std::unique_ptr<IScheduler> _owned_scheduler{nullptr};
+ IScheduler *_scheduler{nullptr};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_RUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index 9e8add1f95..bd29cbb31f 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -74,8 +74,8 @@ public:
static bool is_available(Type t);
private:
- static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static Type _scheduler_type;
+ static std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
index 3ca066e1c8..2badb31b26 100644
--- a/arm_compute/runtime/SubTensor.h
+++ b/arm_compute/runtime/SubTensor.h
@@ -72,5 +72,5 @@ private:
ITensor *_parent;
mutable SubTensorInfo _info;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_SUBTENSOR_H */
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
index 172c8963f0..e71fbd4a96 100644
--- a/arm_compute/runtime/Tensor.h
+++ b/arm_compute/runtime/Tensor.h
@@ -59,7 +59,7 @@ public:
ITensorInfo *info() const override;
ITensorInfo *info() override;
uint8_t *buffer() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
+ void associate_memory_group(IMemoryGroup *memory_group) override;
private:
mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
index a5e16c4d90..d819931415 100644
--- a/arm_compute/runtime/TensorAllocator.h
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_TENSORALLOCATOR_H
#define ARM_COMPUTE_TENSORALLOCATOR_H
#include "arm_compute/runtime/ITensorAllocator.h"
-
#include "arm_compute/runtime/Memory.h"
#include "arm_compute/runtime/MemoryGroup.h"
diff --git a/arm_compute/runtime/common/LSTMParams.h b/arm_compute/runtime/common/LSTMParams.h
index aedb9c0d46..6800faf87f 100644
--- a/arm_compute/runtime/common/LSTMParams.h
+++ b/arm_compute/runtime/common/LSTMParams.h
@@ -79,7 +79,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_cifg_params(const T *input_to_input_weights, const T *recurrent_to_input_weights, T *cell_to_input_weights, const T *input_gate_bias)
+ LSTMParams &set_cifg_params(const T *input_to_input_weights,
+ const T *recurrent_to_input_weights,
+ T *cell_to_input_weights,
+ const T *input_gate_bias)
{
_input_to_input_weights = input_to_input_weights;
_recurrent_to_input_weights = recurrent_to_input_weights;
@@ -125,8 +128,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights, T *forget_layer_norm_weights,
- T *cell_layer_norm_weights, T *output_layer_norm_weights)
+ LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights,
+ T *forget_layer_norm_weights,
+ T *cell_layer_norm_weights,
+ T *output_layer_norm_weights)
{
_input_layer_norm_weights = input_layer_norm_weights;
_forget_layer_norm_weights = forget_layer_norm_weights;
@@ -169,7 +174,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_matmul_scale_params(float input_intermediate_scale, float forget_intermediate_scale, float cell_intermediate_scale, float output_intermediate_scale)
+ LSTMParams &set_matmul_scale_params(float input_intermediate_scale,
+ float forget_intermediate_scale,
+ float cell_intermediate_scale,
+ float output_intermediate_scale)
{
_input_intermediate_scale = input_intermediate_scale;
_forget_intermediate_scale = forget_intermediate_scale;
@@ -338,5 +346,5 @@ private:
bool _has_cifg_opt;
bool _use_layer_norm;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_LSTMPARAMS_H */
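Since each LSTMParams setter reflowed above returns a reference to the object, the calls can be chained; a hedged sketch follows, where the *_ln_w tensor pointers and the scale values are illustrative placeholders.

// Hedged sketch of the fluent LSTMParams<T> interface reformatted above.
LSTMParams<ITensor> params;
params.set_layer_normalization_params(in_ln_w, forget_ln_w, cell_ln_w, out_ln_w)
      .set_matmul_scale_params(0.007f, 0.007f, 0.007f, 0.007f);
// params would then be passed to a function such as NEQLSTMLayer::configure() as its
// lstm_params argument.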
diff --git a/compute_kernel_writer/include/ckw/Error.h b/compute_kernel_writer/include/ckw/Error.h
index 7da9544b9e..6b80778957 100644
--- a/compute_kernel_writer/include/ckw/Error.h
+++ b/compute_kernel_writer/include/ckw/Error.h
@@ -53,7 +53,7 @@ create_error_msg(const std::string &file, const std::string &func, const std::st
const std::string arg2(std::to_string(__LINE__)); \
const std::string arg3(msg); \
std::runtime_error(create_error_msg(arg0, arg1, arg2, arg3)); \
- } while(false)
+ } while (false)
/** Mark the variables as unused.
*
@@ -74,16 +74,16 @@ inline void ignore_unused(T &&...)
*
* @param[in] msg The error message.
*/
-#define CKW_THROW_MSG(msg) \
- do \
- { \
- const std::string file(__FILE__); \
- const std::string func(__func__); \
- const std::string line(std::to_string(__LINE__)); \
- const std::string message(msg); \
- \
+#define CKW_THROW_MSG(msg) \
+ do \
+ { \
+ const std::string file(__FILE__); \
+ const std::string func(__func__); \
+ const std::string line(std::to_string(__LINE__)); \
+ const std::string message(msg); \
+ \
throw std::runtime_error(ckw::create_error_msg(file, func, line, message)); \
- } while(false)
+ } while (false)
#ifdef COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED
@@ -95,11 +95,11 @@ inline void ignore_unused(T &&...)
#define CKW_ASSERT_MSG(cond, msg) \
do \
{ \
- if(!(cond)) \
+ if (!(cond)) \
{ \
CKW_THROW_MSG(msg); \
} \
- } while(false)
+ } while (false)
#else // COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED
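The whitespace-only changes above do not alter the macro semantics; a brief, hedged usage sketch of CKW_ASSERT_MSG follows (the surrounding function is illustrative, not part of this patch).

// Hedged sketch: with COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED defined, a failing
// condition throws std::runtime_error carrying file, function and line information.
void set_tile_count(int32_t count)
{
    CKW_ASSERT_MSG(count > 0, "tile count must be positive");
    // ... proceed with a valid count ...
}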
diff --git a/compute_kernel_writer/include/ckw/Kernel.h b/compute_kernel_writer/include/ckw/Kernel.h
index dc0cad5503..f9b7bbb82e 100644
--- a/compute_kernel_writer/include/ckw/Kernel.h
+++ b/compute_kernel_writer/include/ckw/Kernel.h
@@ -26,6 +26,7 @@
#define CKW_INCLUDE_CKW_KERNEL_H
#include "ckw/KernelArgument.h"
+
#include <string>
#include <vector>
diff --git a/compute_kernel_writer/include/ckw/KernelArgument.h b/compute_kernel_writer/include/ckw/KernelArgument.h
index 530e2920eb..7e9bcbf1ee 100644
--- a/compute_kernel_writer/include/ckw/KernelArgument.h
+++ b/compute_kernel_writer/include/ckw/KernelArgument.h
@@ -27,6 +27,7 @@
#include "ckw/types/TensorComponentType.h"
#include "ckw/types/TensorStorageType.h"
+
#include <cstdint>
namespace ckw
@@ -90,7 +91,7 @@ private:
TensorComponentType tensor_component_type;
};
- SubId _sub_id{ 0 };
+ SubId _sub_id{0};
};
} // namespace ckw
diff --git a/compute_kernel_writer/include/ckw/KernelWriter.h b/compute_kernel_writer/include/ckw/KernelWriter.h
index 15c99fe652..0d739e859a 100644
--- a/compute_kernel_writer/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/include/ckw/KernelWriter.h
@@ -115,7 +115,8 @@ public:
* @param[in] first The first source tile.
* @param[in] second The second source tile.
*/
- virtual void op_binary(const TileOperand &dst, BinaryOp op, const TileOperand &first, const TileOperand &second) = 0;
+ virtual void
+ op_binary(const TileOperand &dst, BinaryOp op, const TileOperand &first, const TileOperand &second) = 0;
/** Write ternary expression statement: `<dst> = <op>(<first>, <second>, <third>);`.
*
@@ -125,7 +126,11 @@ public:
* @param[in] second The second source tile.
* @param[in] third The third source tile.
*/
- virtual void op_ternary(const TileOperand &dst, TernaryOp op, const TileOperand &first, const TileOperand &second, const TileOperand &third) = 0;
+ virtual void op_ternary(const TileOperand &dst,
+ TernaryOp op,
+ const TileOperand &first,
+ const TileOperand &second,
+ const TileOperand &third) = 0;
// =============================================================================================
// Flow control
@@ -138,7 +143,8 @@ public:
* @param[in] rhs The RHS tile of the condition.
* @param[in] body The function that writes the body of the if block.
*/
- virtual void op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) = 0;
+ virtual void
+ op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) = 0;
/** Write else-if block: `else if(<lhs> <op> <rhs>) { <body> }`.
*
@@ -147,7 +153,8 @@ public:
* @param[in] rhs The RHS tile of the condition.
* @param[in] body The function that writes the body of the else-if block.
*/
- virtual void op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) = 0;
+ virtual void
+ op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) = 0;
/** Write an else block: `else { <body> }`.
*
@@ -165,10 +172,13 @@ public:
* @param[in] update_value The value which is updated at every iteration.
* @param[in] body The function that writes the body of the for-loop block.
*/
- virtual void op_for_loop(
- const TileOperand &var, BinaryOp cond_op, const TileOperand &cond_value,
- const TileOperand &update_var, AssignmentOp update_op, const TileOperand &update_value,
- const std::function<void()> &body) = 0;
+ virtual void op_for_loop(const TileOperand &var,
+ BinaryOp cond_op,
+ const TileOperand &cond_value,
+ const TileOperand &update_var,
+ AssignmentOp update_op,
+ const TileOperand &update_value,
+ const std::function<void()> &body) = 0;
/** Write the return statement. */
virtual void op_return() = 0;
@@ -271,9 +281,13 @@ public:
* @param[in] z z-coordinate
* @param[in] batch batch
*/
- virtual void op_load(
- const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) = 0;
+ virtual void op_load(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch) = 0;
/** Load the data from the tensor memory to the tile in a dilated way using the sampling information.
*
@@ -282,27 +296,41 @@ public:
* @param[in] dilation_x Dilation while reading in x-dimension
* @param[in] dilation_y Dilation while reading in y-dimension
*/
- virtual void op_load_dilated(
- const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileOperand &dilation_x, const TileOperand &dilation_y) = 0;
+ virtual void op_load_dilated(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileOperand &dilation_x,
+ const TileOperand &dilation_y) = 0;
/** Store the data to the tensor memory from the tile using the sampling information.
*
* Similar to @ref KernelWriter::op_load()
*/
- virtual void op_store(
- const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) = 0;
+ virtual void op_store(const TensorOperand &tensor_op,
+ const TileOperand &tile_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch) = 0;
/** Store the data to the tensor memory from the tile in a dilated way using the sampling information.
*
* Similar to @ref KernelWriter::op_load_dilated()
*/
- virtual void op_store_dilated(
- const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileOperand &dilation_x, const TileOperand &dilation_y) = 0;
+ virtual void op_store_dilated(const TensorOperand &tensor_op,
+ const TileOperand &tile_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileOperand &dilation_x,
+ const TileOperand &dilation_y) = 0;
/** Load the data from the tensor memory to the tile using the indirect buffer approach and respecting the sampling information.
*
@@ -314,8 +342,13 @@ public:
* @param[in] z z-coordinate
* @param[in] batch batch
*/
- virtual void op_load_indirect(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch_op) = 0;
+ virtual void op_load_indirect(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch_op) = 0;
protected:
// =============================================================================================
@@ -373,8 +406,8 @@ protected:
static DataType get_data_type(const ConstantData &data);
private:
- int32_t _id_space{ 0 };
- int32_t _last_created_id_space{ 0 };
+ int32_t _id_space{0};
+ int32_t _last_created_id_space{0};
};
} // namespace ckw
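
Note (not part of this patch): the reflowed op_for_loop/op_load/op_store declarations above are formatting-only changes; as an illustrative sketch of how these entry points compose, assuming a concrete KernelWriter implementation and tile/tensor operands plus a sampler declared elsewhere (all names below are hypothetical):

    #include "ckw/KernelWriter.h" // assumed entry point; further includes may be needed for the operand types

    // Sketch only: copy elements in the range [x, len) of one row, advancing x by `step`.
    void copy_row(ckw::KernelWriter &writer,
                  ckw::TensorOperand &src, ckw::TensorOperand &dst, ckw::TensorSampler &sampler,
                  ckw::TileOperand &tile, ckw::TileOperand &x, ckw::TileOperand &len,
                  ckw::TileOperand &step, ckw::TileOperand &y, ckw::TileOperand &z, ckw::TileOperand &batch)
    {
        writer.op_for_loop(x, ckw::BinaryOp::Less, len, x, ckw::AssignmentOp::Increment, step,
                           [&]()
                           {
                               writer.op_load(tile, src, sampler, x, y, z, batch);  // tile <- src[x, y, z, batch]
                               writer.op_store(dst, tile, sampler, x, y, z, batch); // dst[x, y, z, batch] <- tile
                           });
    }
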
diff --git a/compute_kernel_writer/include/ckw/TensorInfo.h b/compute_kernel_writer/include/ckw/TensorInfo.h
index 87cf7c1426..5c87cb5b12 100644
--- a/compute_kernel_writer/include/ckw/TensorInfo.h
+++ b/compute_kernel_writer/include/ckw/TensorInfo.h
@@ -27,6 +27,7 @@
#include "ckw/types/DataType.h"
#include "ckw/types/TensorDataLayout.h"
+
#include <array>
#include <cstdint>
@@ -85,10 +86,10 @@ public:
int32_t id() const;
private:
- TensorShape _shape{ { 0 } };
- DataType _dt{ DataType::Unknown };
- TensorDataLayout _dl{ TensorDataLayout::Unknown };
- int32_t _id{ -1 };
+ TensorShape _shape{{0}};
+ DataType _dt{DataType::Unknown};
+ TensorDataLayout _dl{TensorDataLayout::Unknown};
+ int32_t _id{-1};
};
} // namespace ckw
diff --git a/compute_kernel_writer/include/ckw/TensorSampler.h b/compute_kernel_writer/include/ckw/TensorSampler.h
index 1b51636edb..117e8de2cf 100644
--- a/compute_kernel_writer/include/ckw/TensorSampler.h
+++ b/compute_kernel_writer/include/ckw/TensorSampler.h
@@ -25,8 +25,8 @@
#ifndef CKW_INCLUDE_CKW_TENSORSAMPLER_H
#define CKW_INCLUDE_CKW_TENSORSAMPLER_H
-#include "ckw/types/TensorStorageType.h"
#include "ckw/types/TensorSamplerTypes.h"
+#include "ckw/types/TensorStorageType.h"
namespace ckw
{
@@ -53,12 +53,11 @@ public:
* @param[in] address_mode_y The address mode of the y dimension.
* @param[in] address_mode_z The address mode of the z dimension.
*/
- TensorSampler(
- TensorStorageType storage,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z);
+ TensorSampler(TensorStorageType storage,
+ TensorSamplerFormat format,
+ TensorSamplerAddressModeX address_mode_x,
+ TensorSamplerAddressModeY address_mode_y,
+ TensorSamplerAddressModeZ address_mode_z);
/** Get the storage for the tensor */
TensorStorageType storage() const;
@@ -91,11 +90,11 @@ public:
TensorSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z);
private:
- TensorStorageType _storage { TensorStorageType::BufferUint8Ptr };
- TensorSamplerFormat _format { TensorSamplerFormat::Unknown };
- TensorSamplerAddressModeX _address_mode_x { TensorSamplerAddressModeX::Unknown };
- TensorSamplerAddressModeY _address_mode_y { TensorSamplerAddressModeY::Unknown };
- TensorSamplerAddressModeZ _address_mode_z { TensorSamplerAddressModeZ::Unknown };
+ TensorStorageType _storage{TensorStorageType::BufferUint8Ptr};
+ TensorSamplerFormat _format{TensorSamplerFormat::Unknown};
+ TensorSamplerAddressModeX _address_mode_x{TensorSamplerAddressModeX::Unknown};
+ TensorSamplerAddressModeY _address_mode_y{TensorSamplerAddressModeY::Unknown};
+ TensorSamplerAddressModeZ _address_mode_z{TensorSamplerAddressModeZ::Unknown};
};
} // namespace ckw
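
Note (not part of this patch): unlike the prototype TensorTileSampler further down, this TensorSampler carries only storage, format and address modes. A minimal construction sketch using the five-argument constructor declared above; the enumerator choices are illustrative assumptions, not prescribed by this change:

    ckw::TensorSampler sampler(ckw::TensorStorageType::BufferUint8Ptr,
                               ckw::TensorSamplerFormat::Unknown,    // choose a concrete format in real code
                               ckw::TensorSamplerAddressModeX::None, // caller guarantees x is in bounds
                               ckw::TensorSamplerAddressModeY::None,
                               ckw::TensorSamplerAddressModeZ::None);
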
diff --git a/compute_kernel_writer/include/ckw/TileInfo.h b/compute_kernel_writer/include/ckw/TileInfo.h
index b8094f79bf..678bb7aaf6 100644
--- a/compute_kernel_writer/include/ckw/TileInfo.h
+++ b/compute_kernel_writer/include/ckw/TileInfo.h
@@ -83,7 +83,7 @@ public:
DataType data_type() const;
private:
- DataType _dt{ DataType::Unknown };
+ DataType _dt{DataType::Unknown};
TileShape _shape{};
};
diff --git a/compute_kernel_writer/include/ckw/types/ConstantData.h b/compute_kernel_writer/include/ckw/types/ConstantData.h
index 95425b2c65..7708818ca8 100644
--- a/compute_kernel_writer/include/ckw/types/ConstantData.h
+++ b/compute_kernel_writer/include/ckw/types/ConstantData.h
@@ -45,12 +45,12 @@ class KernelWriter;
class ConstantData
{
- using String = std::string;
+ using String = std::string;
using StringVector = std::vector<String>;
public:
/** Templated constructor */
- template<typename T>
+ template <typename T>
ConstantData(std::initializer_list<std::initializer_list<T>> values, DataType data_type);
private:
@@ -60,14 +60,14 @@ private:
*
* @return true if the user-provided data type and the template type are conformant
*/
- template<typename T>
+ template <typename T>
bool validate(DataType data_type);
/** Get the constant data as a 2d vector of string values
*
* @return a 2d vector of strings that has the string-converted values
*/
- const std::vector<StringVector>& values() const;
+ const std::vector<StringVector> &values() const;
/** Get the underlying data type of the constant values
*
@@ -81,7 +81,7 @@ private:
private:
// Data members
std::vector<StringVector> _values{};
- DataType _data_type{};
+ DataType _data_type{};
};
} // namespace ckw
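
Note (not part of this patch): the templated ConstantData constructor above takes a 2-D initializer list plus a DataType tag; a one-line usage sketch with arbitrarily chosen values:

    const ckw::ConstantData weights({{0.25f, 0.5f}, {0.75f, 1.0f}}, ckw::DataType::Fp32);
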
diff --git a/compute_kernel_writer/include/ckw/types/MemoryOperation.h b/compute_kernel_writer/include/ckw/types/MemoryOperation.h
index 0466b82df2..f93f60c51a 100644
--- a/compute_kernel_writer/include/ckw/types/MemoryOperation.h
+++ b/compute_kernel_writer/include/ckw/types/MemoryOperation.h
@@ -27,11 +27,11 @@
namespace ckw
{
- enum class MemoryOperation
- {
- Load = 1,
- Store = 2
- };
+enum class MemoryOperation
+{
+ Load = 1,
+ Store = 2
+};
} // namespace ckw
#endif /* CKW_INCLUDE_CKW_TYPES_MEMORYOPERATION */
diff --git a/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h b/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h
index 43dce1d4e4..512d0b4501 100644
--- a/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h
+++ b/compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h
@@ -75,8 +75,8 @@ enum class TensorSamplerAddressModeY : int32_t
*/
enum class TensorSamplerAddressModeZ : int32_t
{
- Unknown = 0,
- None = 1,
+ Unknown = 0,
+ None = 1,
};
} // namespace ckw
diff --git a/compute_kernel_writer/prototype/examples/add_exp_store.cpp b/compute_kernel_writer/prototype/examples/add_exp_store.cpp
index 6a9884543c..2b640ca01b 100644
--- a/compute_kernel_writer/prototype/examples/add_exp_store.cpp
+++ b/compute_kernel_writer/prototype/examples/add_exp_store.cpp
@@ -32,7 +32,6 @@
#include "common/ExampleComponentArgument.h"
#include "common/ExampleKernelWriter.h"
#include "common/ExampleScopedKernelWriter.h"
-
#include <iostream>
#include <vector>
@@ -78,14 +77,14 @@ void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vector<Example
auto dst = operands.at(2);
// Load the LHS and RHS tile and prepare the tensor sampler.
- if(!lhs->has_tile() && !rhs->has_tile())
+ if (!lhs->has_tile() && !rhs->has_tile())
{
const auto sampler = create_simple_sampler(writer);
writer->op_load_once(lhs, sampler);
writer->op_load_once(rhs, sampler);
}
- else if(lhs->has_tile())
+ else if (lhs->has_tile())
{
const auto &sampler = lhs->tile_sampler();
writer->op_load_once(rhs, sampler);
@@ -101,7 +100,7 @@ void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vector<Example
const auto &sampler = lhs->tile_sampler();
// Prepare the output tile.
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
auto &tile = writer->declare_tile("dst_tile", lhs_tile.tile_info());
dst->init_virtual_tensor(tile, sampler);
@@ -119,7 +118,7 @@ void op_exp(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgume
auto dst = operands.at(1);
// Load the source tile and prepare the sampler.
- if(!src->has_tile())
+ if (!src->has_tile())
{
const auto sampler = create_simple_sampler(writer);
writer->op_load_once(src, sampler);
@@ -129,7 +128,7 @@ void op_exp(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgume
const auto &sampler = src->tile_sampler();
// Prepare the output tile.
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
dst->init_virtual_tensor(tile, sampler);
@@ -160,34 +159,38 @@ int main()
ExampleScopedKernelWriter writer(&root_writer);
- const TensorInfo src0_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 0);
- const TensorInfo src1_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 1);
- const TensorInfo dst_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 2);
+ const TensorInfo src0_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 0);
+ const TensorInfo src1_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 1);
+ const TensorInfo dst_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 2);
- ExampleComponentArgument src0(writer->declare_tensor_argument("src0", src0_info, TensorStorageType::BufferUint8Ptr));
- ExampleComponentArgument src1(writer->declare_tensor_argument("src1", src1_info, TensorStorageType::BufferUint8Ptr));
+ ExampleComponentArgument src0(
+ writer->declare_tensor_argument("src0", src0_info, TensorStorageType::BufferUint8Ptr));
+ ExampleComponentArgument src1(
+ writer->declare_tensor_argument("src1", src1_info, TensorStorageType::BufferUint8Ptr));
ExampleComponentArgument dst(writer->declare_tensor_argument("dst", dst_info, TensorStorageType::BufferUint8Ptr));
ExampleComponentArgument ans;
- op_binary_elementwise(writer, { &src0, &src1, &ans });
- op_exp(writer, { &ans, &ans });
- op_store(writer, { &ans, &dst });
+ op_binary_elementwise(writer, {&src0, &src1, &ans});
+ op_exp(writer, {&ans, &ans});
+ op_store(writer, {&ans, &dst});
const auto arguments = kernel.arguments();
std::cout << "\n====================\nArguments:\n====================\n";
- for(auto &arg : arguments)
+ for (auto &arg : arguments)
{
- switch(arg.type())
+ switch (arg.type())
{
case ckw::KernelArgument::Type::TensorStorage:
- std::cout << "* Tensor storage: ID = " << arg.id() << ", type = " << std::hex << "0x" << static_cast<uint32_t>(arg.tensor_storage_type()) << std::dec << "\n";
+ std::cout << "* Tensor storage: ID = " << arg.id() << ", type = " << std::hex << "0x"
+ << static_cast<uint32_t>(arg.tensor_storage_type()) << std::dec << "\n";
break;
case ckw::KernelArgument::Type::TensorComponent:
- std::cout << "* Tensor component: ID = " << arg.id() << ", type = " << std::hex << "0x" << static_cast<uint32_t>(arg.tensor_component_type()) << std::dec << "\n";
+ std::cout << "* Tensor component: ID = " << arg.id() << ", type = " << std::hex << "0x"
+ << static_cast<uint32_t>(arg.tensor_component_type()) << std::dec << "\n";
break;
default:
                std::cout << "* Unknown argument type\n";
                break;
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
index 5a2ec526cc..55223dae0e 100644
--- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
+++ b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
@@ -23,19 +23,19 @@
*/
#include "ExampleComponentArgument.h"
+
#include "ckw/Error.h"
ExampleComponentArgument::ExampleComponentArgument()
{
}
-ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor)
- : _tensor(&tensor)
+ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
{
}
-ExampleComponentArgument &
-ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler)
+ExampleComponentArgument &ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
+ const ckw::TensorTileSampler &tile_sampler)
{
CKW_ASSERT(_tile == nullptr);
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
index 9fdc50ba08..0e029b1157 100644
--- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
+++ b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
@@ -104,8 +104,8 @@ public:
const ckw::TensorTileSampler &tile_sampler() const;
private:
- ckw::TensorOperand *_tensor{ nullptr };
- ckw::TileOperand *_tile{ nullptr };
+ ckw::TensorOperand *_tensor{nullptr};
+ ckw::TileOperand *_tile{nullptr};
ckw::TensorTileSampler _tile_sampler{};
};
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
index 6b9f244735..1734ce8823 100644
--- a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
+++ b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
@@ -23,26 +23,27 @@
*/
#include "ExampleKernelWriter.h"
-#include "ExampleComponentArgument.h"
+
#include "ckw/Error.h"
#include "ckw/TileInfo.h"
-ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel)
- : KernelWriter(kernel)
+#include "ExampleComponentArgument.h"
+
+ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
{
}
void ExampleKernelWriter::op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
{
- if(!tensor_or_tile->has_tile())
+ if (!tensor_or_tile->has_tile())
{
CKW_ASSERT(tensor_or_tile->has_tensor());
auto &tensor = tensor_or_tile->tensor();
const auto tile_name = tensor.name() + "_tile";
- auto &tile = declare_tile(tile_name.c_str(),
- ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
+ auto &tile =
+ declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
op_load(tile, tensor, sampler);
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
index 7c44fa8749..784d5ffb96 100644
--- a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
+++ b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
@@ -23,6 +23,7 @@
*/
#include "ExampleScopedKernelWriter.h"
+
#include "ExampleKernelWriter.h"
ExampleScopedKernelWriter::ExampleScopedKernelWriter(ExampleKernelWriter *writer)
diff --git a/compute_kernel_writer/prototype/examples/writer_helper.cpp b/compute_kernel_writer/prototype/examples/writer_helper.cpp
index ccef92dcdf..8623afbf50 100644
--- a/compute_kernel_writer/prototype/examples/writer_helper.cpp
+++ b/compute_kernel_writer/prototype/examples/writer_helper.cpp
@@ -23,14 +23,14 @@
*/
#include "ckw/KernelWriter.h"
-#include "../include/ckw/KernelWriterHelper.h"
#include "ckw/TensorTileSampler.h"
+#include "../include/ckw/KernelWriterHelper.h"
#include <iostream>
using namespace ckw;
-TensorTileSampler create_simple_sampler(KernelWriter& writer)
+TensorTileSampler create_simple_sampler(KernelWriter &writer)
{
TensorTileSampler sampler;
@@ -65,11 +65,11 @@ TensorTileSampler create_simple_sampler(KernelWriter& writer)
int main()
{
- Kernel kernel("test", GpuTargetLanguage::OpenCL);
+ Kernel kernel("test", GpuTargetLanguage::OpenCL);
KernelWriterHelper<KernelWriter> writer(kernel);
- const TensorInfo src_info(DataType::Fp32, TensorShape({ 1, 1, 1, 1, 1 }), TensorDataLayout::Nhwc, 0);
- const TensorInfo dst_info(DataType::Fp32, TensorShape({ 1, 1, 1, 1, 1 }), TensorDataLayout::Nhwc, 1);
+ const TensorInfo src_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 0);
+ const TensorInfo dst_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 1);
auto &src_tensor = writer.declare_tensor_argument("src", src_info);
auto &dst_tensor = writer.declare_tensor_argument("dst", dst_info);
@@ -77,27 +77,24 @@ int main()
const auto sampler = create_simple_sampler(writer);
auto &src = writer.declare_tile("src_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
- auto &other = writer.declare_tile("other_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
+ auto &other =
+ writer.declare_tile("other_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
auto &dst = writer.declare_tile("dst_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
writer.op_load(src, src_tensor, sampler);
writer.op_load(other, src_tensor, sampler);
writer.op_load(dst, dst_tensor, sampler);
- auto test = dst ^ src ^ other;
+ auto test = dst ^ src ^ other;
auto other_test = logical_and(dst, src, other);
writer.op_assign(dst, logical_and(dst, src, other));
writer.op_assign(dst, test);
writer.op_assign(dst, other_test);
writer.op_assign(dst, operator^(operator^(dst, src), other));
- writer.op_if(exp(src) == dst, [&]{
- writer.op_binary_expression(dst, src, BinaryOp::Add, src);
- }).op_else_if(exp(src) > dst, [&]{
- writer.op_binary_expression(dst, src, BinaryOp::Add, src);
- }).op_else([&] {
- writer.op_assign(dst, src);
- });
+ writer.op_if(exp(src) == dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); })
+ .op_else_if(exp(src) > dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); })
+ .op_else([&] { writer.op_assign(dst, src); });
writer.op_assign(dst, src + src * src);
writer.op_assign(dst, src * max(src, dst) + src);
@@ -106,13 +103,11 @@ int main()
writer.op_assign(dst, src ^ dst);
writer.op_assign(dst, ~src);
- writer.op_for_loop(dst < src, dst += src, [&]{
- writer.op_assign(dst, src + dst);
- });
+ writer.op_for_loop(dst < src, dst += src, [&] { writer.op_assign(dst, src + dst); });
writer.op_assign(dst += src);
writer.op_assign(dst += exp(src));
std::cout << "======== KERNEL ========" << std::endl;
std::cout << writer.generate_code() << std::endl;
-}
\ No newline at end of file
+}
diff --git a/compute_kernel_writer/prototype/include/ckw/Error.h b/compute_kernel_writer/prototype/include/ckw/Error.h
index b18944eac5..aab713c817 100644
--- a/compute_kernel_writer/prototype/include/ckw/Error.h
+++ b/compute_kernel_writer/prototype/include/ckw/Error.h
@@ -39,11 +39,11 @@ namespace ckw
#define CKW_ASSERT_MSG(cond, msg) \
do \
{ \
- if(!(cond)) \
+ if (!(cond)) \
{ \
throw ::std::runtime_error(msg); \
} \
- } while(false)
+ } while (false)
/** If the condition is not met, throw an std::runtime_error.
*
@@ -56,8 +56,7 @@ namespace ckw
* @param[in] precond The condition which, if met, requires that the consequence also be met.
* @param[in] cond The condition that is expected to be true if the precondition is true.
*/
-#define CKW_ASSERT_IF(precond, cond) \
- CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond)
+#define CKW_ASSERT_IF(precond, cond) CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond)
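
Note (not part of this patch): the single-line CKW_ASSERT_IF above reads as "precondition implies condition"; a usage sketch with hypothetical variables has_tile and tile_ptr:

    // If a tile has been bound (precondition), its pointer must be non-null (condition).
    CKW_ASSERT_IF(has_tile, tile_ptr != nullptr);
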
/** Mark the variables as unused.
*
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h b/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
index af8bcde634..3384a20aef 100644
--- a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
+++ b/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
@@ -26,6 +26,7 @@
#define CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
#include "ckw/TensorInfo.h"
+
#include <cstdint>
namespace ckw
@@ -98,7 +99,7 @@ private:
TensorComponentType tensor_component_type;
};
- SubId _sub_id{ 0 };
+ SubId _sub_id{0};
};
} // namespace ckw
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
index fdb5fedc59..f9e0066f91 100644
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
@@ -94,7 +94,9 @@ public:
*
* @return The @ref TensorOperand object.
*/
- TensorOperand &declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type = TensorStorageType::BufferUint8Ptr);
+ TensorOperand &declare_tensor_argument(const std::string &name,
+ const TensorInfo &info,
+ TensorStorageType storage_type = TensorStorageType::BufferUint8Ptr);
/** Declare a compile-time constant scalar argument.
*
@@ -134,7 +136,10 @@ public:
* @param[in] sampler The tensor sampling information.
* @param[in] dilation_y Dilation in the Y dimension.
*/
- void op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y = TileOperand("dil_y", 1));
+ void op_load(TileOperand &tile,
+ const TensorOperand &tensor,
+ const TensorTileSampler &sampler,
+ const TileOperand &dilation_y = TileOperand("dil_y", 1));
/** Load the data from the tensor memory to the tile using the indirect buffer approach and respecting the sampling information.
*
@@ -221,7 +226,10 @@ public:
* @param[in] first The first argument tile.
* @param[in] second The second argument tile.
*/
- void op_binary_elementwise_function(const TileOperand &dst, BinaryFunction func, const TileOperand &first, const TileOperand &second);
+ void op_binary_elementwise_function(const TileOperand &dst,
+ BinaryFunction func,
+ const TileOperand &first,
+ const TileOperand &second);
/** Write function applied to scalar value: `<dst> = <func>(<first>, <second>, <third>);`.
*
@@ -231,7 +239,11 @@ public:
* @param[in] second The second argument tile.
* @param[in] third The third argument tile.
*/
- void op_ternary_elementwise_function(const TileOperand &dst, TernaryFunction func, const TileOperand &first, const TileOperand &second, const TileOperand &third);
+ void op_ternary_elementwise_function(const TileOperand &dst,
+ TernaryFunction func,
+ const TileOperand &first,
+ const TileOperand &second,
+ const TileOperand &third);
/** Write if-statement: `if(<lhs> <op> <rhs>) { <body> }`.
*
@@ -267,7 +279,13 @@ public:
* @param[in, out] update_value The value which is updated at every iteration.
* @param[in] body The body of the for-loop.
*/
- void op_for_loop(const TileOperand &var_name, BinaryOp cond_op, const TileOperand &cond_value_name, const TileOperand &update_var_name, AssignmentOp update_op, const TileOperand &update_value_name, const std::function<void()> &body);
+ void op_for_loop(const TileOperand &var_name,
+ BinaryOp cond_op,
+ const TileOperand &cond_value_name,
+ const TileOperand &update_var_name,
+ AssignmentOp update_op,
+ const TileOperand &update_value_name,
+ const std::function<void()> &body);
/** Write the return statement: `return;`
*/
@@ -311,8 +329,8 @@ private:
::std::unique_ptr<prototype::GpuKernelWriterAttribute> _impl_attr;
::std::unique_ptr<prototype::IGpuKernelWriter> _impl;
- int32_t _id_space{ 0 };
- int32_t _max_id_space{ 0 };
+ int32_t _id_space{0};
+ int32_t _max_id_space{0};
};
} // namespace ckw
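
Note (not part of this patch): for the prototype writer, the reflowed declare_tensor_argument and op_load declarations above line up with the example programs earlier in this patch; a compressed sketch follows, with tensor/tile names and shapes chosen purely for illustration:

    ckw::Kernel       kernel("sketch", ckw::GpuTargetLanguage::OpenCL);
    ckw::KernelWriter writer(kernel);

    const ckw::TensorInfo src_info(ckw::DataType::Fp32, ckw::TensorShape({3, 10, 20, 1, 1}), ckw::TensorDataLayout::Nhwc, 0);
    auto &src  = writer.declare_tensor_argument("src", src_info, ckw::TensorStorageType::BufferUint8Ptr);
    auto &tile = writer.declare_tile("src_tile", ckw::TileInfo(ckw::DataType::Fp32, 1, 3));

    ckw::TensorTileSampler sampler;     // default-constructed; configure before generating real code
    writer.op_load(tile, src, sampler); // dilation_y falls back to the unit-tile default declared above
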
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h b/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
index a8be859680..3ba079bbc2 100644
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
+++ b/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
@@ -32,8 +32,6 @@
#include <iostream>
#include <type_traits>
-#include <iostream>
-
/*
* By including this header file you will be able to supplement the default
* Compute Kernel Writer API with additional syntax to help ease the use of CKW.
@@ -154,7 +152,9 @@ struct can_be_assigned<TileOperand &> : ::std::true_type
* @tparam TLeft The type of the destination of the assignment.
* @tparam TRight The type of the source assigned to the destination.
*/
-template <typename TLeft, typename TRight, typename = ::std::enable_if<can_be_operand<TRight>::value && can_be_assigned<TLeft>::value>>
+template <typename TLeft,
+ typename TRight,
+ typename = ::std::enable_if<can_be_operand<TRight>::value && can_be_assigned<TLeft>::value>>
struct Assignment
{
TLeft lhs;
@@ -173,7 +173,7 @@ struct Assignment
template <typename TLeft, typename TRight>
inline Assignment<TLeft, TRight> operator+=(TLeft &&lhs, TRight &&rhs)
{
- return Assignment<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Increment };
+ return Assignment<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Increment};
}
/** Represents the expression: `\p lhs -= \p rhs`.
@@ -187,7 +187,7 @@ inline Assignment<TLeft, TRight> operator+=(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline Assignment<TLeft, TRight> operator-=(TLeft &&lhs, TRight &&rhs)
{
- return Assignment<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Decrement };
+ return Assignment<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Decrement};
}
// ==================================================
@@ -221,7 +221,7 @@ struct can_be_operand<UnaryExpression<TLeft>> : ::std::true_type
template <typename TSrc>
inline UnaryExpression<TSrc> operator!(TSrc &&src)
{
- return UnaryExpression<TSrc>{ std::forward<TSrc>(src), UnaryOp::LogicalNot };
+ return UnaryExpression<TSrc>{std::forward<TSrc>(src), UnaryOp::LogicalNot};
}
/** Represents the expression: `~\p src`.
@@ -233,7 +233,7 @@ inline UnaryExpression<TSrc> operator!(TSrc &&src)
template <typename TSrc>
inline UnaryExpression<TSrc> operator~(TSrc &&src)
{
- return UnaryExpression<TSrc>{ std::forward<TSrc>(src), UnaryOp::BitwiseNot };
+ return UnaryExpression<TSrc>{std::forward<TSrc>(src), UnaryOp::BitwiseNot};
}
// ==================================================
@@ -247,7 +247,9 @@ inline UnaryExpression<TSrc> operator~(TSrc &&src)
* @tparam TLeft The type of the left argument of the expression.
* @tparam TRight The type of the right argument of the expression.
*/
-template <typename TLeft, typename TRight, typename = ::std::enable_if_t<can_be_operand<TLeft>::value && can_be_operand<TRight>::value>>
+template <typename TLeft,
+ typename TRight,
+ typename = ::std::enable_if_t<can_be_operand<TLeft>::value && can_be_operand<TRight>::value>>
struct BinaryExpression
{
TLeft lhs;
@@ -271,7 +273,7 @@ struct can_be_operand<BinaryExpression<TLeft, TRight>> : ::std::true_type
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator+(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Add };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Add};
}
/** Represents the expression: `\p lhs - \p rhs`.
@@ -285,7 +287,7 @@ inline BinaryExpression<TLeft, TRight> operator+(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator-(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Sub };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Sub};
}
/** Represents the expression: `\p lhs * \p rhs`.
@@ -299,7 +301,7 @@ inline BinaryExpression<TLeft, TRight> operator-(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator*(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mul };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mul};
}
/** Represents the expression: `\p lhs / \p rhs`.
@@ -313,7 +315,7 @@ inline BinaryExpression<TLeft, TRight> operator*(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator/(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Div };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Div};
}
/** Represents the expression: `\p lhs % \p rhs`.
@@ -327,7 +329,7 @@ inline BinaryExpression<TLeft, TRight> operator/(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator%(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mod };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mod};
}
/** Represents the expression: `\p lhs == \p rhs`.
@@ -341,7 +343,7 @@ inline BinaryExpression<TLeft, TRight> operator%(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator==(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Equal };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Equal};
}
/** Represents the expression: `\p lhs < \p rhs`.
@@ -355,7 +357,7 @@ inline BinaryExpression<TLeft, TRight> operator==(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator<(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Less };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Less};
}
/** Represents the expression: `\p lhs <= \p rhs`.
@@ -369,7 +371,7 @@ inline BinaryExpression<TLeft, TRight> operator<(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator<=(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LessEqual };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LessEqual};
}
/** Represents the expression: `\p lhs > \p rhs`.
@@ -383,7 +385,7 @@ inline BinaryExpression<TLeft, TRight> operator<=(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator>(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Greater };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Greater};
}
/** Represents the expression: `\p lhs >= \p rhs`.
@@ -397,7 +399,7 @@ inline BinaryExpression<TLeft, TRight> operator>(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator>=(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::GreaterEqual };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::GreaterEqual};
}
/** Represents the expression: `\p lhs ^ \p rhs`.
@@ -411,7 +413,7 @@ inline BinaryExpression<TLeft, TRight> operator>=(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> operator^(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::BitwiseXOR };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::BitwiseXOR};
}
/** Represents the expression: `\p lhs && \p rhs`.
@@ -425,7 +427,7 @@ inline BinaryExpression<TLeft, TRight> operator^(TLeft &&lhs, TRight &&rhs)
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> logical_and(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd};
}
/** Represents the expression: `\p lhs && \p rhs`.
@@ -440,7 +442,7 @@ template <typename TLeft, typename TRight, typename... TOps>
inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_and(TLeft &&lhs, TRight &&rhs, TOps &&...ops)
{
return logical_and(
- BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd },
+ BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd},
std::forward<TOps>(ops)...);
}
@@ -455,7 +457,7 @@ inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_and(TL
template <typename TLeft, typename TRight>
inline BinaryExpression<TLeft, TRight> logical_or(TLeft &&lhs, TRight &&rhs)
{
- return BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr };
+ return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr};
}
/** Represents the expression: `\p lhs || \p rhs`.
@@ -470,7 +472,7 @@ template <typename TLeft, typename TRight, typename... TOps>
inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_or(TLeft &&lhs, TRight &&rhs, TOps &&...ops)
{
return logical_or(
- BinaryExpression<TLeft, TRight>{ std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr },
+ BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr},
std::forward<TOps>(ops)...);
}
@@ -505,7 +507,7 @@ struct can_be_operand<UnaryElementwiseFunction<TLeft>> : ::std::true_type
template <typename TSrc>
UnaryElementwiseFunction<TSrc> exp(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Exp };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Exp};
}
/** Represents the expression: `tanh(\p src)`.
@@ -517,7 +519,7 @@ UnaryElementwiseFunction<TSrc> exp(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> tanh(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Tanh };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Tanh};
}
/** Represents the expression: `sqrt(\p src)`.
@@ -529,7 +531,7 @@ UnaryElementwiseFunction<TSrc> tanh(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> sqrt(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Sqrt };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Sqrt};
}
/** Represents the expression: `erf(\p src)`.
@@ -541,7 +543,7 @@ UnaryElementwiseFunction<TSrc> sqrt(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> erf(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Erf };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Erf};
}
/** Represents the expression: `fabs(\p src)`.
@@ -553,7 +555,7 @@ UnaryElementwiseFunction<TSrc> erf(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> fabs(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Fabs };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Fabs};
}
/** Represents the expression: `log(\p src)`.
@@ -565,7 +567,7 @@ UnaryElementwiseFunction<TSrc> fabs(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> log(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Log };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Log};
}
/** Represents the expression: `round(\p src)`.
@@ -577,7 +579,7 @@ UnaryElementwiseFunction<TSrc> log(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> round(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::Round };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Round};
}
/** Represents the expression: `sizeof(\p src)`.
@@ -589,7 +591,7 @@ UnaryElementwiseFunction<TSrc> round(TSrc &&src)
template <typename TSrc>
UnaryElementwiseFunction<TSrc> sizeOf(TSrc &&src)
{
- return UnaryElementwiseFunction<TSrc>{ std::forward<TSrc>(src), UnaryFunction::SizeOf };
+ return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::SizeOf};
}
// ==================================================
@@ -603,7 +605,9 @@ UnaryElementwiseFunction<TSrc> sizeOf(TSrc &&src)
* @tparam TFirst The type of the left argument of the function.
* @tparam TSecond The type of the right argument of the function.
*/
-template <typename TFirst, typename TSecond, typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value>>
+template <typename TFirst,
+ typename TSecond,
+ typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value>>
struct BinaryElementwiseFunction
{
TFirst first;
@@ -627,7 +631,8 @@ struct can_be_operand<BinaryElementwiseFunction<TFirst, TSecond>> : ::std::true_
template <typename TFirst, typename TSecond>
BinaryElementwiseFunction<TFirst, TSecond> max(TFirst &&first, TSecond &&second)
{
- return BinaryElementwiseFunction<TFirst, TSecond>{ std::forward<TFirst>(first), std::forward<TSecond>(second), BinaryFunction::Max };
+ return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
+ BinaryFunction::Max};
}
/** Represents the function call: `min(\p first, \p second)`.
@@ -641,7 +646,8 @@ BinaryElementwiseFunction<TFirst, TSecond> max(TFirst &&first, TSecond &&second)
template <typename TFirst, typename TSecond>
BinaryElementwiseFunction<TFirst, TSecond> min(TFirst &&first, TSecond &&second)
{
- return BinaryElementwiseFunction<TFirst, TSecond>{ std::forward<TFirst>(first), std::forward<TSecond>(second), BinaryFunction::Min };
+ return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
+ BinaryFunction::Min};
}
// ==================================================
@@ -656,7 +662,11 @@ BinaryElementwiseFunction<TFirst, TSecond> min(TFirst &&first, TSecond &&second)
* @tparam TSecond The type of the second argument to the function.
* @tparam TThird The type of the third argument to the function.
*/
-template <typename TFirst, typename TSecond, typename TThird, typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value && can_be_operand<TThird>::value>>
+template <typename TFirst,
+ typename TSecond,
+ typename TThird,
+ typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value &&
+ can_be_operand<TThird>::value>>
struct TernaryElementwiseFunction
{
TFirst first;
@@ -683,7 +693,9 @@ struct can_be_operand<TernaryElementwiseFunction<TFirst, TSecond, TThird>> : ::s
template <typename TFirst, typename TSecond, typename TThird>
TernaryElementwiseFunction<TFirst, TSecond, TThird> select(TFirst &&first, TSecond &&second, TThird &&third)
{
- return TernaryElementwiseFunction<TFirst, TSecond, TThird>{ std::forward<TFirst>(first), std::forward<TSecond>(second), std::forward<TThird>(third), TernaryFunction::Select };
+ return TernaryElementwiseFunction<TFirst, TSecond, TThird>{std::forward<TFirst>(first),
+ std::forward<TSecond>(second),
+ std::forward<TThird>(third), TernaryFunction::Select};
}
/** Helper class used to extend a KernelWriter with additional functionality
@@ -715,7 +727,8 @@ public:
* @param[in] cond The BinaryExpression representing the condition.
* @param[in] body The body of the if-statement.
*/
- KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TileOperand &> &cond, const std::function<void()> &body)
+ KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TileOperand &> &cond,
+ const std::function<void()> &body)
{
TWriter::op_if(cond.lhs, cond.opcode, cond.rhs, body);
return *this;
@@ -730,7 +743,8 @@ public:
* @param[in] body The body of the if-statement.
*/
template <typename TRight>
- KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TRight> &cond, const std::function<void()> &body)
+ KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TRight> &cond,
+ const std::function<void()> &body)
{
auto &tmp1 = declare_temp_tile(cond.lhs.tile_info());
op_assign(tmp1, cond.rhs);
@@ -747,7 +761,8 @@ public:
* @param[in] body The body of the if-statement.
*/
template <typename TLeft>
- KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TLeft, TileOperand &> &cond, const std::function<void()> &body)
+ KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TLeft, TileOperand &> &cond,
+ const std::function<void()> &body)
{
auto &tmp1 = declare_temp_tile(cond.rhs.tile_info());
op_assign(tmp1, cond.lhs);
@@ -766,7 +781,8 @@ public:
* @param[in] cond The BinaryExpression representing the condition.
* @param[in] body The body of the else-if-statement.
*/
- KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TileOperand &> &cond, const std::function<void()> &body)
+ KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TileOperand &> &cond,
+ const std::function<void()> &body)
{
TWriter::op_else_if(cond.lhs, cond.opcode, cond.rhs, body);
return *this;
@@ -781,7 +797,8 @@ public:
* @param[in] body The body of the else-if-statement.
*/
template <typename TRight>
- KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TRight> &cond, const std::function<void()> &body)
+ KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TRight> &cond,
+ const std::function<void()> &body)
{
auto &tmp1 = declare_temp_tile(cond.lhs.tile_info());
op_assign(tmp1, cond.rhs);
@@ -798,7 +815,8 @@ public:
* @param[in] body The body of the else-if-statement.
*/
template <typename TLeft>
- KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TLeft, TileOperand &> &cond, const std::function<void()> &body)
+ KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TLeft, TileOperand &> &cond,
+ const std::function<void()> &body)
{
auto &tmp1 = declare_temp_tile(cond.rhs.tile_info());
op_assign(tmp1, cond.lhs);
@@ -823,7 +841,9 @@ public:
* @param[in] updater The Assignment representing the updater.
* @param[in] body The body of the for-loop.
*/
- void op_for_loop(const BinaryExpression<TileOperand &, TileOperand &> &cond, const Assignment<TileOperand &, TileOperand &> &updater, const std::function<void()> &body)
+ void op_for_loop(const BinaryExpression<TileOperand &, TileOperand &> &cond,
+ const Assignment<TileOperand &, TileOperand &> &updater,
+ const std::function<void()> &body)
{
TWriter::op_for_loop(cond.lhs, cond.opcode, cond.rhs, updater.lhs, updater.opcode, updater.rhs, body);
}
@@ -1029,7 +1049,8 @@ public:
* @param[in] dst The tile which is assigned to.
* @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
*/
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TileOperand &, TileOperand &> &exp)
+ void op_assign(const TileOperand &dst,
+ const TernaryElementwiseFunction<TileOperand &, TileOperand &, TileOperand &> &exp)
{
TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, exp.second, exp.third);
}
@@ -1169,11 +1190,11 @@ public:
*/
void op_assign(const Assignment<TileOperand &, TileOperand &> &exp)
{
- if(exp.opcode == AssignmentOp::Increment)
+ if (exp.opcode == AssignmentOp::Increment)
{
TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Add, exp.rhs);
}
- else if(exp.opcode == AssignmentOp::Decrement)
+ else if (exp.opcode == AssignmentOp::Decrement)
{
TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Sub, exp.rhs);
}
@@ -1192,7 +1213,7 @@ public:
{
auto &tmp1 = declare_temp_tile(exp.lhs.tile_info());
op_assign(tmp1, exp.rhs);
- op_assign(Assignment<TileOperand &, TileOperand &>{ exp.lhs, tmp1, exp.opcode });
+ op_assign(Assignment<TileOperand &, TileOperand &>{exp.lhs, tmp1, exp.opcode});
}
private:
@@ -1241,11 +1262,8 @@ private:
template <typename... TOps, typename = ::std::enable_if_t<std::is_same<TOps..., TileInfo>::value>>
TileInfo get_largest_size(const TileInfo &first, const TileInfo &second, const TOps &...ops)
{
- TileInfo largest = {
- first.data_type(),
- std::max(first.width(), second.width()),
- std::max(first.height(), second.height())
- };
+ TileInfo largest = {first.data_type(), std::max(first.width(), second.width()),
+ std::max(first.height(), second.height())};
return get_largest_size(largest, ops...);
}
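
Note (not part of this patch): most of the helper syntax above is exercised by writer_helper.cpp earlier in this patch; the ternary select wrapper is not, so a brief sketch on top of the writer and the dst/src/other tiles from that example (illustrative only):

    writer.op_assign(dst, select(src, other, dst)); // dst = select(src, other, dst) via TernaryFunction::Select
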
diff --git a/compute_kernel_writer/prototype/include/ckw/OperandBase.h b/compute_kernel_writer/prototype/include/ckw/OperandBase.h
index 06d9f82756..9842127339 100644
--- a/compute_kernel_writer/prototype/include/ckw/OperandBase.h
+++ b/compute_kernel_writer/prototype/include/ckw/OperandBase.h
@@ -26,6 +26,7 @@
#define CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
#include "ckw/types/DataType.h"
+
#include <string>
namespace ckw
diff --git a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
index 16c3f6d441..2a9c42acc8 100644
--- a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
+++ b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
@@ -59,9 +59,9 @@ public:
_size = sizeof(T);
- if(::std::is_integral<T>::value)
+ if (::std::is_integral<T>::value)
{
- if(::std::is_signed<T>::value)
+ if (::std::is_signed<T>::value)
{
_type = Type::INT;
_value.i64 = value;
@@ -90,9 +90,9 @@ public:
CKW_ASSERT(::std::is_integral<T>::value || ::std::is_floating_point<T>::value);
CKW_ASSERT(sizeof(T) >= _size);
- if(::std::is_integral<T>::value)
+ if (::std::is_integral<T>::value)
{
- if(::std::is_signed<T>::value)
+ if (::std::is_signed<T>::value)
{
CKW_ASSERT(_type == Type::INT || _type == Type::UINT);
CKW_ASSERT_IF(_type == Type::UINT, sizeof(T) > _size);
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
index 55f8101a53..24da7dc8ab 100644
--- a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
+++ b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
@@ -143,10 +143,10 @@ public:
int32_t id() const;
private:
- TensorShape _shape{ { 0 } };
- DataType _dt{ DataType::Unknown };
- TensorDataLayout _dl{ TensorDataLayout::Unknown };
- int32_t _id{ -1 };
+ TensorShape _shape{{0}};
+ DataType _dt{DataType::Unknown};
+ TensorDataLayout _dl{TensorDataLayout::Unknown};
+ int32_t _id{-1};
};
} // namespace ckw
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
index 6d88932c66..c221b449fa 100644
--- a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
+++ b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
@@ -139,21 +139,21 @@ private:
TensorInfo _info;
TensorStorageType _storage_type;
- TileOperand *_tile{ nullptr };
+ TileOperand *_tile{nullptr};
TensorTileSampler _tile_sampler{};
- ::std::unique_ptr<TensorComponentOperand> _stride1{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _stride2{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _stride3{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _stride4{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim0{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim1{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim2{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim3{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim4{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim1_dim2{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _dim1_dim2_dim3{ nullptr };
- ::std::unique_ptr<TensorComponentOperand> _offset_first_element_in_bytes{ nullptr };
+ ::std::unique_ptr<TensorComponentOperand> _stride1{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _stride2{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _stride3{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _stride4{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim0{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim1{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim2{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim3{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim4{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim1_dim2{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _dim1_dim2_dim3{nullptr};
+ ::std::unique_ptr<TensorComponentOperand> _offset_first_element_in_bytes{nullptr};
};
// =================================================================================================
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
index e1bf0c52b8..606dec3535 100644
--- a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
+++ b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
@@ -26,6 +26,7 @@
#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
#include "ckw/types/TensorSamplerTypes.h"
+
#include <functional>
namespace ckw
@@ -55,12 +56,14 @@ public:
* @param[in] address_mode_y The address mode of the y dimension.
* @param[in] address_mode_z The address mode of the z dimension.
*/
- TensorTileSampler(
- TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z);
+ TensorTileSampler(TileOperand &x,
+ TileOperand &y,
+ TileOperand &z,
+ TileOperand &b,
+ TensorSamplerFormat format,
+ TensorSamplerAddressModeX address_mode_x,
+ TensorSamplerAddressModeY address_mode_y,
+ TensorSamplerAddressModeZ address_mode_z);
/** Initialize a new instance of @ref TensorSampler class.
*
@@ -75,13 +78,16 @@ public:
* @param[in] address_mode_y The address mode of the y dimension.
* @param[in] address_mode_z The address mode of the z dimension.
*/
- TensorTileSampler(
- TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b,
- int32_t height, int32_t width,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z);
+ TensorTileSampler(TileOperand &x,
+ TileOperand &y,
+ TileOperand &z,
+ TileOperand &b,
+ int32_t height,
+ int32_t width,
+ TensorSamplerFormat format,
+ TensorSamplerAddressModeX address_mode_x,
+ TensorSamplerAddressModeY address_mode_y,
+ TensorSamplerAddressModeZ address_mode_z);
/** Get the coordinate in the x dimension. */
const TileOperand &x() const;
@@ -144,18 +150,18 @@ public:
TensorTileSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z);
private:
- TileOperand *_x{ nullptr };
- TileOperand *_y{ nullptr };
- TileOperand *_z{ nullptr };
- TileOperand *_b{ nullptr };
-
- int32_t _height{ 0 };
- int32_t _width{ 0 };
-
- TensorSamplerFormat _format{ TensorSamplerFormat::Unknown };
- TensorSamplerAddressModeX _address_mode_x{ TensorSamplerAddressModeX::Unknown };
- TensorSamplerAddressModeY _address_mode_y{ TensorSamplerAddressModeY::Unknown };
- TensorSamplerAddressModeZ _address_mode_z{ TensorSamplerAddressModeZ::Unknown };
+ TileOperand *_x{nullptr};
+ TileOperand *_y{nullptr};
+ TileOperand *_z{nullptr};
+ TileOperand *_b{nullptr};
+
+ int32_t _height{0};
+ int32_t _width{0};
+
+ TensorSamplerFormat _format{TensorSamplerFormat::Unknown};
+ TensorSamplerAddressModeX _address_mode_x{TensorSamplerAddressModeX::Unknown};
+ TensorSamplerAddressModeY _address_mode_y{TensorSamplerAddressModeY::Unknown};
+ TensorSamplerAddressModeZ _address_mode_z{TensorSamplerAddressModeZ::Unknown};
};
} // namespace ckw
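
Note (not part of this patch): in contrast to the TensorSampler in the public include above, this prototype TensorTileSampler also bundles the coordinate tiles and the tile height/width. A construction sketch using the ten-argument overload; the coordinate tiles x, y, z, b are assumed to have been declared by the writer beforehand and the enumerator choices are illustrative:

    ckw::TensorTileSampler sampler(x, y, z, b,
                                   /* height */ 1, /* width */ 4,
                                   ckw::TensorSamplerFormat::Unknown, // pick a concrete format in real code
                                   ckw::TensorSamplerAddressModeX::None,
                                   ckw::TensorSamplerAddressModeY::ClampToBorder,
                                   ckw::TensorSamplerAddressModeZ::None);
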
diff --git a/compute_kernel_writer/prototype/include/ckw/TileInfo.h b/compute_kernel_writer/prototype/include/ckw/TileInfo.h
index de9e47af2b..e0d064169e 100644
--- a/compute_kernel_writer/prototype/include/ckw/TileInfo.h
+++ b/compute_kernel_writer/prototype/include/ckw/TileInfo.h
@@ -83,7 +83,7 @@ public:
DataType data_type() const;
private:
- DataType _dt{ DataType::Unknown };
+ DataType _dt{DataType::Unknown};
TileShape _shape{};
};
diff --git a/compute_kernel_writer/prototype/include/ckw/types/Functions.h b/compute_kernel_writer/prototype/include/ckw/types/Functions.h
index bc1f85c188..c6afaa0ac8 100644
--- a/compute_kernel_writer/prototype/include/ckw/types/Functions.h
+++ b/compute_kernel_writer/prototype/include/ckw/types/Functions.h
@@ -32,14 +32,14 @@ namespace ckw
enum class UnaryFunction : int32_t
{
- Exp = 0x0000,
- Tanh = 0x0001,
- Sqrt = 0x0002,
- Erf = 0x0003,
- Fabs = 0x0004,
- Log = 0x0006,
- Round = 0x0007,
- Floor = 0x0008,
+ Exp = 0x0000,
+ Tanh = 0x0001,
+ Sqrt = 0x0002,
+ Erf = 0x0003,
+ Fabs = 0x0004,
+ Log = 0x0006,
+ Round = 0x0007,
+ Floor = 0x0008,
// Misc
SizeOf = 0x0009,
@@ -47,8 +47,8 @@ enum class UnaryFunction : int32_t
enum class BinaryFunction : int32_t
{
- Min = 0x0000,
- Max = 0x0001,
+ Min = 0x0000,
+ Max = 0x0001,
};
enum class TernaryFunction : int32_t
diff --git a/compute_kernel_writer/prototype/include/ckw/types/Operators.h b/compute_kernel_writer/prototype/include/ckw/types/Operators.h
index 43241170a5..b560996837 100644
--- a/compute_kernel_writer/prototype/include/ckw/types/Operators.h
+++ b/compute_kernel_writer/prototype/include/ckw/types/Operators.h
@@ -69,8 +69,8 @@ enum class BinaryOp : int32_t
enum class AssignmentOp : int32_t
{
// Unary
- Increment = 0x0000, // +=
- Decrement = 0x0001, // -=
+ Increment = 0x0000, // +=
+ Decrement = 0x0001, // -=
};
} // namespace ckw
diff --git a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h b/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
index 836bd13c95..63405a0764 100644
--- a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
+++ b/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
@@ -39,34 +39,38 @@ enum class TensorSamplerFormat : int32_t
enum class TensorSamplerAddressModeX : int32_t
{
- Unknown = 0,
- None = 1, // The user guarantees that the X coordinate is always in-bound
- OverlappingMin = 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
- // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice.
+ Unknown = 0,
+ None = 1, // The user guarantees that the X coordinate is always in-bound
+ OverlappingMin =
+ 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
+ // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice.
};
enum class TensorSamplerAddressModeY : int32_t
{
- Unknown = 0,
- None = 1, // The user guarantees that the Y coordinate is always in-bound
- OverlappingMin = 2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
- Skip = 3, // Skip the read/write
- SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
- SkipMaxEdgeOnly = 5, // Skip less than 0 only
- ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y)
- ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX
- ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0
- ClampToBorder = 9, // Clamp to border which always has 0 value
+ Unknown = 0,
+ None = 1, // The user guarantees that the Y coordinate is always in-bound
+ OverlappingMin =
+ 2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
+ Skip = 3, // Skip the read/write
+ SkipMinEdgeOnly =
+ 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
+ SkipMaxEdgeOnly = 5, // Skip less than 0 only
+ ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y)
+ ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX
+ ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0
+ ClampToBorder = 9, // Clamp to border which always has 0 value
ClampToBorderMinEdgeOnly = 10,
ClampToBorderMaxEdgeOnly = 11
};
enum class TensorSamplerAddressModeZ : int32_t
{
- Unknown = 0,
- None = 1, // The user guarantees that the Y coordinate is always in-bound
- Skip = 3, // Skip the read/write
- SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
+ Unknown = 0,
+ None = 1, // The user guarantees that the Y coordinate is always in-bound
+ Skip = 3, // Skip the read/write
+ SkipMinEdgeOnly =
+ 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
SkipMaxEdgeOnly = 5, // Skip less than 0 only
ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y)
ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX
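A minimal sketch of the leftover handling that the OverlappingMin comments above describe (the names and values below are illustrative only, not taken from the library):

    // Nominal load/store length vl, tensor extent `width` along the addressed axis.
    const int32_t vl       = 4;
    const int32_t width    = 10;
    const int32_t leftover = width % vl; // 2 -> shortened access at coordinate 0 (MIN)
    // The alternative the comment mentions: keep the full length vl but start the
    // first access at width - vl, overlapping (re-processing) vl - leftover elements.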
diff --git a/compute_kernel_writer/prototype/src/Kernel.cpp b/compute_kernel_writer/prototype/src/Kernel.cpp
index 095ac879f1..6228ed17d0 100644
--- a/compute_kernel_writer/prototype/src/Kernel.cpp
+++ b/compute_kernel_writer/prototype/src/Kernel.cpp
@@ -23,24 +23,27 @@
*/
#include "ckw/Kernel.h"
+
#include "ckw/TensorOperand.h"
#include "ckw/types/GpuTargetLanguage.h"
+
#include "src/Prototype.h"
namespace ckw
{
-Kernel::Kernel(GpuTargetLanguage language)
- : Kernel{"unnamed", language}
+Kernel::Kernel(GpuTargetLanguage language) : Kernel{"unnamed", language}
{
}
Kernel::Kernel(const char *name, GpuTargetLanguage language)
- : _name(name), _kernel(std::make_unique<prototype::GpuKernelWriterDataHolder>(language)), _operands{}, _tensor_id_operands{}
+ : _name(name),
+ _kernel(std::make_unique<prototype::GpuKernelWriterDataHolder>(language)),
+ _operands{},
+ _tensor_id_operands{}
{
}
-
Kernel::~Kernel()
{
}
@@ -50,7 +53,7 @@ const std::string &Kernel::name() const
return _name;
}
-void Kernel::name(const std::string& name)
+void Kernel::name(const std::string &name)
{
_name = name;
}
@@ -60,14 +63,14 @@ std::vector<KernelArgument> Kernel::arguments() const
const auto impl_args = _kernel->arguments.tensor_argument_declarations();
- for(auto tensor_arg : impl_args)
+ for (auto tensor_arg : impl_args)
{
auto tensor = _tensor_id_operands.at(tensor_arg->format().id);
arguments.push_back(*tensor);
- for(auto component_arg : tensor_arg->component_declarations())
+ for (auto component_arg : tensor_arg->component_declarations())
{
- switch(component_arg)
+ switch (component_arg)
{
case TensorComponentType::OffsetFirstElement:
arguments.push_back(tensor->offset_first_element_in_bytes());
diff --git a/compute_kernel_writer/prototype/src/KernelArgument.cpp b/compute_kernel_writer/prototype/src/KernelArgument.cpp
index 2b4d7c8cee..24ace28eb3 100644
--- a/compute_kernel_writer/prototype/src/KernelArgument.cpp
+++ b/compute_kernel_writer/prototype/src/KernelArgument.cpp
@@ -23,14 +23,14 @@
*/
#include "ckw/KernelArgument.h"
+
#include "ckw/Error.h"
#include "ckw/TensorOperand.h"
namespace ckw
{
-KernelArgument::KernelArgument(TensorOperand &tensor)
- : _type(Type::TensorStorage), _id(tensor.info().id())
+KernelArgument::KernelArgument(TensorOperand &tensor) : _type(Type::TensorStorage), _id(tensor.info().id())
{
_sub_id.tensor_storage_type = tensor.storage_type();
}
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
index 5c9a16ee33..9f58d9fefa 100644
--- a/compute_kernel_writer/prototype/src/KernelWriter.cpp
+++ b/compute_kernel_writer/prototype/src/KernelWriter.cpp
@@ -23,9 +23,11 @@
*/
#include "ckw/KernelWriter.h"
+
#include "ckw/Error.h"
#include "ckw/TensorInfo.h"
#include "ckw/TensorOperand.h"
+
#include "src/Prototype.h"
#include <sstream>
@@ -38,7 +40,7 @@ namespace
inline prototype::TensorInfo create_impl_tensor_info(const TensorInfo &info)
{
- return prototype::TensorInfo{ info.shape(), info.data_type(), info.data_layout(), info.id() };
+ return prototype::TensorInfo{info.shape(), info.data_type(), info.data_layout(), info.id()};
}
} // namespace
@@ -86,7 +88,8 @@ int32_t KernelWriter::next_id_space()
// Tensor and tile declaration
// =================================================================================================
-TensorOperand &KernelWriter::declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type)
+TensorOperand &
+KernelWriter::declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type)
{
const auto var_name = generate_variable_name(name);
@@ -120,13 +123,11 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> ope
auto &operand = _kernel->register_operand(std::move(operand_ptr));
const auto &name = operand.name();
- if(!operand.is_constant())
+ if (!operand.is_constant())
{
const auto &info = operand.tile_info();
- _impl->declare_tile(
- name,
- prototype::TileInfo(info.data_type(), info.width(), info.height()));
+ _impl->declare_tile(name, prototype::TileInfo(info.data_type(), info.width(), info.height()));
}
else
{
@@ -140,16 +141,15 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> ope
// Load and store
// =================================================================================================
-void KernelWriter::op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y)
+void KernelWriter::op_load(TileOperand &tile,
+ const TensorOperand &tensor,
+ const TensorTileSampler &sampler,
+ const TileOperand &dilation_y)
{
prototype::TensorOperand impl_tensor(
tensor.name(),
- prototype::GpuSampler{
- sampler.format(),
- prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(),
- sampler.address_mode_y(),
- sampler.address_mode_z() });
+ prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
+ sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
auto impl_x = sampler.x().create_impl_operand(_impl.get());
auto impl_y = sampler.y().create_impl_operand(_impl.get());
@@ -167,12 +167,8 @@ void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tens
{
prototype::TensorOperand impl_tensor(
tensor.name(),
- prototype::GpuSampler{
- sampler.format(),
- prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(),
- sampler.address_mode_y(),
- sampler.address_mode_z() });
+ prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
+ sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
auto impl_x = sampler.x().create_impl_operand(_impl.get());
auto impl_y = sampler.y().create_impl_operand(_impl.get());
@@ -194,12 +190,8 @@ void KernelWriter::util_get_indirect_buffer(TileOperand &tile,
{
prototype::TensorOperand impl_tensor(
tensor.name(),
- prototype::GpuSampler{
- sampler.format(),
- prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(),
- sampler.address_mode_y(),
- sampler.address_mode_z() });
+ prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
+ sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
auto impl_x = x.create_impl_operand(_impl.get());
auto impl_y = y.create_impl_operand(_impl.get());
@@ -215,12 +207,8 @@ void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, cons
{
prototype::TensorOperand impl_tensor(
tensor.name(),
- prototype::GpuSampler{
- sampler.format(),
- prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(),
- sampler.address_mode_y(),
- sampler.address_mode_z() });
+ prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
+ sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
auto impl_src = tile.create_impl_operand(_impl.get());
auto impl_x = sampler.x().create_impl_operand(_impl.get());
auto impl_y = sampler.y().create_impl_operand(_impl.get());
@@ -250,7 +238,10 @@ void KernelWriter::op_cast_expression(const TileOperand &dst, const TileOperand
_impl->op_cast_expression(impl_dst, impl_src, policy);
}
-void KernelWriter::op_binary_expression(const TileOperand &dst, const TileOperand &lhs, BinaryOp op, const TileOperand &rhs)
+void KernelWriter::op_binary_expression(const TileOperand &dst,
+ const TileOperand &lhs,
+ BinaryOp op,
+ const TileOperand &rhs)
{
auto impl_lhs = lhs.create_impl_operand(_impl.get());
auto impl_rhs = rhs.create_impl_operand(_impl.get());
@@ -275,7 +266,10 @@ void KernelWriter::op_unary_elementwise_function(const TileOperand &dst, UnaryFu
_impl->op_unary_elementwise_function(impl_dst, opcode, impl_src);
}
-void KernelWriter::op_binary_elementwise_function(const TileOperand &dst, BinaryFunction opcode, const TileOperand &first, const TileOperand &second)
+void KernelWriter::op_binary_elementwise_function(const TileOperand &dst,
+ BinaryFunction opcode,
+ const TileOperand &first,
+ const TileOperand &second)
{
auto impl_dst = dst.create_impl_operand(_impl.get());
auto impl_first = first.create_impl_operand(_impl.get());
@@ -284,7 +278,11 @@ void KernelWriter::op_binary_elementwise_function(const TileOperand &dst, Binary
_impl->op_binary_elementwise_function(impl_dst, opcode, impl_first, impl_second);
}
-void KernelWriter::op_ternary_elementwise_function(const TileOperand &dst, TernaryFunction opcode, const TileOperand &first, const TileOperand &second, const TileOperand &third)
+void KernelWriter::op_ternary_elementwise_function(const TileOperand &dst,
+ TernaryFunction opcode,
+ const TileOperand &first,
+ const TileOperand &second,
+ const TileOperand &third)
{
auto impl_dst = dst.create_impl_operand(_impl.get());
auto impl_first = first.create_impl_operand(_impl.get());
@@ -305,7 +303,10 @@ void KernelWriter::op_if(const TileOperand &lhs, BinaryOp op, const TileOperand
_impl->compound_statement_end();
}
-void KernelWriter::op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body)
+void KernelWriter::op_else_if(const TileOperand &lhs,
+ BinaryOp op,
+ const TileOperand &rhs,
+ const std::function<void()> &body)
{
auto impl_lhs = lhs.create_impl_operand(_impl.get());
auto impl_rhs = rhs.create_impl_operand(_impl.get());
@@ -324,14 +325,21 @@ void KernelWriter::op_else(const std::function<void()> &body)
_impl->compound_statement_end();
}
-void KernelWriter::op_for_loop(const TileOperand &var_name, BinaryOp cond_op, const TileOperand &cond_value_name, const TileOperand &update_var_name, AssignmentOp update_op, const TileOperand &update_value_name, const std::function<void()> &body)
+void KernelWriter::op_for_loop(const TileOperand &var_name,
+ BinaryOp cond_op,
+ const TileOperand &cond_value_name,
+ const TileOperand &update_var_name,
+ AssignmentOp update_op,
+ const TileOperand &update_value_name,
+ const std::function<void()> &body)
{
auto impl_var_name = var_name.create_impl_operand(_impl.get());
auto impl_cond_value_name = cond_value_name.create_impl_operand(_impl.get());
auto impl_update_var_name = update_var_name.create_impl_operand(_impl.get());
auto impl_update_value_name = update_value_name.create_impl_operand(_impl.get());
- _impl->op_for_loop_header(impl_var_name, cond_op, impl_cond_value_name, impl_update_var_name, update_op, impl_update_value_name);
+ _impl->op_for_loop_header(impl_var_name, cond_op, impl_cond_value_name, impl_update_var_name, update_op,
+ impl_update_value_name);
_impl->compound_statement_begin();
body();
_impl->compound_statement_end();
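For orientation, a hedged usage sketch of the op_for_loop call wired up above; `writer`, the tile operands and BinaryOp::Less are assumed names for illustration, while the signature, BinaryOp::Add, AssignmentOp::Increment and the header/compound-statement sequence come from this file:

    // Emits the loop header, then '{', the body statements, then '}'.
    writer.op_for_loop(i, BinaryOp::Less, len,          // condition operands (comparison operator assumed)
                       i, AssignmentOp::Increment, one, // update: i += one
                       [&]()
                       {
                           writer.op_binary_expression(acc, acc, BinaryOp::Add, one);
                       });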
diff --git a/compute_kernel_writer/prototype/src/OperandBase.cpp b/compute_kernel_writer/prototype/src/OperandBase.cpp
index 59cf846cc7..e0617fdc06 100644
--- a/compute_kernel_writer/prototype/src/OperandBase.cpp
+++ b/compute_kernel_writer/prototype/src/OperandBase.cpp
@@ -27,8 +27,7 @@
namespace ckw
{
-OperandBase::OperandBase(const std::string &name)
- : _name(name)
+OperandBase::OperandBase(const std::string &name) : _name(name)
{
}
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index eb9d7198a9..433eef9e7b 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -25,12 +25,21 @@
#ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H
#define CKW_PROTOTYPE_SRC_PROTOTYPE_H
+#include "ckw/Error.h"
+#include "ckw/TensorInfo.h"
+#include "ckw/types/ConvertPolicy.h"
+#include "ckw/types/DataType.h"
+#include "ckw/types/Functions.h"
+#include "ckw/types/GpuTargetLanguage.h"
+#include "ckw/types/Operators.h"
+#include "ckw/types/TensorSamplerTypes.h"
+
#include <algorithm>
#include <array>
#include <cassert> // assert (to be removed)
#include <chrono>
#include <cmath>
-#include <cstdint> // int32_t
+#include <cstdint> // int32_t
#include <functional>
#include <iostream> // cout (to be removed)
#include <map>
@@ -40,15 +49,6 @@
#include <unordered_map>
#include <vector>
-#include "ckw/Error.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/types/ConvertPolicy.h"
-#include "ckw/types/DataType.h"
-#include "ckw/types/Functions.h"
-#include "ckw/types/GpuTargetLanguage.h"
-#include "ckw/types/Operators.h"
-#include "ckw/types/TensorSamplerTypes.h"
-
namespace ckw
{
namespace prototype
@@ -83,21 +83,21 @@ enum class GpuExtensions
struct TensorInfo
{
- TensorShape shape{ { 0 } };
- DataType data_type{ DataType::Unknown };
- TensorDataLayout data_layout{ TensorDataLayout::Nhwc };
- int32_t id{ -1 };
+ TensorShape shape{{0}};
+ DataType data_type{DataType::Unknown};
+ TensorDataLayout data_layout{TensorDataLayout::Nhwc};
+ int32_t id{-1};
};
struct ComponentAttribute
{
- GpuCompilationSpeed compilation_speed{ GpuCompilationSpeed::Fast };
- bool overwrite_tile{ true };
+ GpuCompilationSpeed compilation_speed{GpuCompilationSpeed::Fast};
+ bool overwrite_tile{true};
};
inline std::string data_type_to_cl_type(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::Fp32:
return "float";
@@ -125,7 +125,7 @@ inline std::string data_type_to_cl_type(DataType dt)
inline int32_t width_to_cl_vector_size(int32_t width)
{
- switch(width)
+ switch (width)
{
case 1:
return 1;
@@ -160,7 +160,7 @@ inline std::string get_cl_data_type(DataType dt, int32_t width)
std::string data_type;
int32_t w = width_to_cl_vector_size(width);
data_type += data_type_to_cl_type(dt);
- if(w != 1)
+ if (w != 1)
{
data_type += std::to_string(w);
}
@@ -169,7 +169,7 @@ inline std::string get_cl_data_type(DataType dt, int32_t width)
inline std::string to_opencl_store(int32_t vector_length)
{
- if(vector_length != 1)
+ if (vector_length != 1)
{
return "vstore" + std::to_string(vector_length) + "(";
}
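A short illustration of how these helpers compose OpenCL type and store strings, assuming (as the switch suggests) that a width of 4 maps to vector size 4:

    const std::string vec_type = get_cl_data_type(DataType::Fp32, 4); // "float" + "4" -> "float4"
    const std::string store    = to_opencl_store(4);                  // "vstore4("
    // With width 1 the suffix is skipped, so get_cl_data_type(DataType::Fp32, 1)
    // stays "float" and to_opencl_store(1) takes the scalar path instead.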
@@ -185,24 +185,21 @@ struct TileInfo
{
}
- TileInfo(DataType dt)
- : dt(dt), w(1), h(1)
+ TileInfo(DataType dt) : dt(dt), w(1), h(1)
{
}
- TileInfo(DataType dt, int32_t width)
- : dt(dt), w(width), h(1)
+ TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1)
{
}
- TileInfo(DataType dt, int32_t width, int32_t height)
- : dt(dt), w(width), h(height)
+ TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height)
{
}
- DataType dt{ DataType::Unknown }; // Data type of the tile
- int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels)
- int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions)
+ DataType dt{DataType::Unknown}; // Data type of the tile
+ int32_t w{0}; // Width (i.e. c0 - portion of the channels)
+ int32_t h{0}; // Height (i.e. s0 - portion of the spatial dimensions)
};
inline std::ostream &operator<<(std::ostream &o, const TileInfo &a)
@@ -213,14 +210,14 @@ inline std::ostream &operator<<(std::ostream &o, const TileInfo &a)
struct DataTypeAsString
{
- std::string str{ "" };
- DataType dt{ DataType::Unknown };
- int32_t size{ 1 };
+ std::string str{""};
+ DataType dt{DataType::Unknown};
+ int32_t size{1};
};
struct ValueAsString
{
- std::string str{ "" };
+ std::string str{""};
DataTypeAsString type{};
};
@@ -276,8 +273,8 @@ public:
virtual bool need_declaration() const = 0;
protected:
- TileInfo _format{}; // Tile format
- std::string _basename{ "" }; // Tile name
+ TileInfo _format{}; // Tile format
+ std::string _basename{""}; // Tile name
};
// A tile is a collection of variables used to express 2D data. The variables are vectors in the GPU context.
@@ -329,7 +326,7 @@ public:
t.type.size = 1;
// Check required because if the width has only one element, we cannot use .s0
- if(_format.w != 1)
+ if (_format.w != 1)
{
// Automatic broadcasting
t.str += ".s" + std::to_string(x);
@@ -360,10 +357,10 @@ public:
t.type.dt = _format.dt;
t.type.size = width;
- if(_format.w != 1)
+ if (_format.w != 1)
{
t.str += ".s";
- for(int i = 0; i < width; ++i)
+ for (int i = 0; i < width; ++i)
{
t.str += to_scalar_hex(x_start + i);
}
@@ -374,7 +371,7 @@ public:
std::vector<ValueAsString> underlying_source_variables() const override
{
std::vector<ValueAsString> vars;
- for(int32_t y = 0; y < _format.h; ++y)
+ for (int32_t y = 0; y < _format.h; ++y)
{
ValueAsString t;
t.str = build_variable_name(y);
@@ -401,7 +398,7 @@ private:
{
std::string var_name = _basename;
- if(_format.h == 1)
+ if (_format.h == 1)
{
return var_name;
}
@@ -416,7 +413,7 @@ private:
std::string to_scalar_hex(int32_t x) const
{
- switch(x)
+ switch (x)
{
case 0:
case 1:
@@ -461,9 +458,9 @@ public:
_data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
- for(int32_t y = 0; y < _format.h; ++y)
+ for (int32_t y = 0; y < _format.h; ++y)
{
- for(int32_t x = 0; x < _format.w; ++x)
+ for (int32_t x = 0; x < _format.w; ++x)
{
_data[y][x] = in[y][x];
}
@@ -501,20 +498,20 @@ public:
t.type.dt = _format.dt;
t.type.size = width;
- if(width > 1)
+ if (width > 1)
{
t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
}
int32_t x = x_start;
- for(; x < width - 1; ++x)
+ for (; x < width - 1; ++x)
{
t.str += scalar(x, y).str;
t.str += ", ";
}
t.str += scalar(x, y).str;
- if(width > 1)
+ if (width > 1)
{
t.str += "))";
}
@@ -526,9 +523,9 @@ public:
{
std::vector<ValueAsString> vars;
- for(int32_t y = 0; y < _format.h; ++y)
+ for (int32_t y = 0; y < _format.h; ++y)
{
- for(int32_t x = 0; x < _format.w; ++x)
+ for (int32_t x = 0; x < _format.w; ++x)
{
ValueAsString t;
t.str = _data[y][x];
@@ -572,7 +569,7 @@ enum class TensorComponentGroup : int32_t
inline std::string to_string(TensorComponentType x)
{
- switch(x)
+ switch (x)
{
case TensorComponentType::Unknown:
return "Unknown";
@@ -672,7 +669,7 @@ enum class GpuTensorStorage : int32_t
inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s)
{
- switch(s)
+ switch (s)
{
case TensorStorageType::Unknown:
return GpuTensorStorage::Unknown;
@@ -694,7 +691,7 @@ inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s)
inline TensorStorageType to_tensor_storage(GpuTensorStorage s)
{
- switch(s)
+ switch (s)
{
case GpuTensorStorage::Unknown:
return TensorStorageType::Unknown;
@@ -755,23 +752,23 @@ public:
// Methods to override
std::string component(TensorComponentType x) override
{
- if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Constant)))
+ if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Constant)))
{
int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
return std::to_string(idx - 1);
}
- if(_return_by_value_when_possible)
+ if (_return_by_value_when_possible)
{
- if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Dimension)))
+ if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Dimension)))
{
int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
return std::to_string(_format.shape[idx]);
}
- if((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::FoldedDimension)))
+ if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::FoldedDimension)))
{
- switch(x)
+ switch (x)
{
case TensorComponentType::Dim1xDim2:
return std::to_string(_format.shape[1] * _format.shape[2]);
@@ -784,7 +781,7 @@ public:
}
}
- if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
+ if (std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
{
_components_required.push_back(x);
}
@@ -804,7 +801,7 @@ public:
std::string storage(GpuTensorStorage x) override
{
- if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
+ if (std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
{
_storage_required.push_back(x);
}
@@ -814,7 +811,7 @@ public:
std::string storage_type_declaration(GpuTensorStorage x) const override
{
- switch(x)
+ switch (x)
{
case GpuTensorStorage::BufferUint8Ptr:
return "__global uchar*";
@@ -848,7 +845,7 @@ private:
{
std::string var_name = _basename;
- switch(x)
+ switch (x)
{
case GpuTensorStorage::BufferUint8Ptr:
return var_name + "_ptr";
@@ -870,7 +867,7 @@ private:
{
std::string var_name = _basename;
- switch(x)
+ switch (x)
{
case TensorComponentType::OffsetFirstElement:
return var_name + "_offset_first_element";
@@ -900,9 +897,9 @@ private:
return var_name;
}
- bool _return_by_value_when_possible{ false };
- std::vector<GpuTensorStorage> _storage_required{};
- std::vector<TensorComponentType> _components_required{};
+ bool _return_by_value_when_possible{false};
+ std::vector<GpuTensorStorage> _storage_required{};
+ std::vector<TensorComponentType> _components_required{};
};
/**
@@ -930,16 +927,16 @@ public:
struct RegistryTileTableEntry
{
- RegistryLevel registry_level{ 0 };
- std::unique_ptr<IVectorTile> tile_object{ nullptr };
+ RegistryLevel registry_level{0};
+ std::unique_ptr<IVectorTile> tile_object{nullptr};
};
struct RegistryTileTypeTableEntry
{
- RegistryTileType tile_type{ RegistryTileType::Tile };
+ RegistryTileType tile_type{RegistryTileType::Tile};
RegistryTileName tile_name{};
- RegistryIdSpace registry_idspace{ 0 };
- RegistryLevel registry_level{ 0 };
+ RegistryIdSpace registry_idspace{0};
+ RegistryLevel registry_level{0};
};
using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
@@ -1002,7 +999,7 @@ public:
auto it = _frags.begin();
- while(it != _frags.end())
+ while (it != _frags.end())
{
x.push_back(it->first);
@@ -1026,7 +1023,7 @@ public:
// First check whether a tile with the same name exists
IVectorTile *result = (*this)[key_var_name];
assert(result == nullptr);
- if(result == nullptr)
+ if (result == nullptr)
{
std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
@@ -1058,7 +1055,7 @@ public:
// First check whether a tile with the same name exists
IVectorTile *result = (*this)[key_var_name];
assert(result == nullptr);
- if(result == nullptr)
+ if (result == nullptr)
{
std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
_frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
@@ -1090,7 +1087,7 @@ public:
// First check whether a tile with the same name exists
IVectorTile *result = (*this)[key_var_name];
assert(result == nullptr);
- if(result == nullptr)
+ if (result == nullptr)
{
std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
_frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
@@ -1123,7 +1120,7 @@ public:
// First check whether a tile with the same name exists
IVectorTile *result = (*this)[key_var_name];
assert(result == nullptr);
- if(result == nullptr)
+ if (result == nullptr)
{
std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
_frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
@@ -1153,10 +1150,10 @@ public:
IVectorTile *result = nullptr;
auto search_IdSpace = _frags.find(key_IdSpace);
- if(search_IdSpace != _frags.end())
+ if (search_IdSpace != _frags.end())
{
auto search_tile = _frags[key_IdSpace].find(key_var_name);
- if(search_tile != _frags[key_IdSpace].end())
+ if (search_tile != _frags[key_IdSpace].end())
{
result = search_tile->second.tile_object.get();
assert(result != nullptr);
@@ -1224,7 +1221,7 @@ public:
std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
- while(it != _frag_types[IdSpace].end())
+ while (it != _frag_types[IdSpace].end())
{
            // The following line should be enabled. However, we cannot do that at this stage
// because it used to retrieve the output tile produced by each component.
@@ -1259,9 +1256,9 @@ public:
// Remove all variables in the local scope
std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
- while(it != _frags[_IdSpace].end())
+ while (it != _frags[_IdSpace].end())
{
- if(it->second.registry_level == _registry_level)
+ if (it->second.registry_level == _registry_level)
{
it = _frags[_IdSpace].erase(it);
}
@@ -1273,9 +1270,9 @@ public:
std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
- while(it_type != _frag_types[_IdSpace].end())
+ while (it_type != _frag_types[_IdSpace].end())
{
- if(it_type->second.registry_level == _registry_level)
+ if (it_type->second.registry_level == _registry_level)
{
it_type = _frag_types[_IdSpace].erase(it_type);
}
@@ -1302,7 +1299,7 @@ private:
std::string generate_tile_name(const std::string &name)
{
assert(_IdSpace >= 0);
- if(_registry_level == 0)
+ if (_registry_level == 0)
{
return "_G" + std::to_string(_IdSpace) + "_" + name;
}
@@ -1314,10 +1311,10 @@ private:
RegistryTileTable _frags{};
RegistryTileTypeTable _frag_types{};
- RegistryLevel _registry_level{ 0 };
- RegistryIdSpace _IdSpace{ -1 };
- int32_t _anonymous_frag_count{ 0 }; // Counter used to create the anonymous tiles
- GpuTargetLanguage _language{ GpuTargetLanguage::Unknown }; // Gpu programming language
+ RegistryLevel _registry_level{0};
+ RegistryIdSpace _IdSpace{-1};
+ int32_t _anonymous_frag_count{0}; // Counter used to create the anonymous tiles
+ GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language
};
using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
@@ -1388,7 +1385,7 @@ public:
auto it = _refs.begin();
- while(it != _refs.end())
+ while (it != _refs.end())
{
x.push_back(it->first);
@@ -1420,12 +1417,12 @@ public:
// Check whether a tensor with that tensorID exists
auto result = _tensor_arguments.find(tensor_id);
- if(result == _tensor_arguments.end())
+ if (result == _tensor_arguments.end())
{
            // It means that we haven't added a tensor with that tensor_id yet. Create an IGpuTensorArgument before creating the reference
- std::unique_ptr<ClTensorArgument> arg = std::make_unique<ClTensorArgument>(var_name, x,
- return_by_value_when_possible);
- _tensor_arguments[tensor_id] = std::move(arg);
+ std::unique_ptr<ClTensorArgument> arg =
+ std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
+ _tensor_arguments[tensor_id] = std::move(arg);
}
_refs[key_IdSpace][key_var_name] = tensor_id;
@@ -1445,15 +1442,15 @@ public:
IGpuTensorArgument *result = nullptr;
auto search_IdSpace = _refs.find(key_IdSpace);
- if(search_IdSpace != _refs.end())
+ if (search_IdSpace != _refs.end())
{
auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
- if(search_tensor_id != _refs[key_IdSpace].end())
+ if (search_tensor_id != _refs[key_IdSpace].end())
{
const int32_t tensor_id = search_tensor_id->second;
auto search_tensor_argument = _tensor_arguments.find(tensor_id);
- if(search_tensor_argument != _tensor_arguments.end())
+ if (search_tensor_argument != _tensor_arguments.end())
{
result = search_tensor_argument->second.get();
}
@@ -1475,7 +1472,7 @@ public:
auto it = _tensor_arguments.begin();
- while(it != _tensor_arguments.end())
+ while (it != _tensor_arguments.end())
{
args.push_back(it->second.get());
it++;
@@ -1499,7 +1496,7 @@ public:
auto search_IdSpace = _refs.find(key_IdSpace);
- if(search_IdSpace != _refs.end())
+ if (search_IdSpace != _refs.end())
{
auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
@@ -1527,7 +1524,7 @@ public:
auto search_IdSpace = _refs.find(key_IdSpace);
- if(search_IdSpace != _refs.end())
+ if (search_IdSpace != _refs.end())
{
auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
@@ -1550,8 +1547,8 @@ private:
std::map<int32_t, TensorEntry> _tensor_arguments{};
std::map<int32_t, std::map<std::string, int32_t>> _refs{};
- int32_t _IdSpace{ -1 };
- GpuTargetLanguage _language{ GpuTargetLanguage::Unknown }; // Gpu programming language
+ int32_t _IdSpace{-1};
+ GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language
};
enum class OpType : int32_t
@@ -1563,7 +1560,7 @@ enum class OpType : int32_t
inline std::string to_string(AssignmentOp op)
{
- switch(op)
+ switch (op)
{
case AssignmentOp::Decrement:
return "-=";
@@ -1577,7 +1574,7 @@ inline std::string to_string(AssignmentOp op)
inline std::string to_string(UnaryOp op)
{
- switch(op)
+ switch (op)
{
case UnaryOp::LogicalNot:
return "!";
@@ -1593,7 +1590,7 @@ inline std::string to_string(UnaryOp op)
inline std::string to_string(BinaryOp op)
{
- switch(op)
+ switch (op)
{
case BinaryOp::Add:
return "+";
@@ -1629,7 +1626,7 @@ inline std::string to_string(BinaryOp op)
inline std::string binary_op_string(BinaryOp op)
{
- switch(op)
+ switch (op)
{
case BinaryOp::Add:
return "add";
@@ -1698,13 +1695,12 @@ struct ScalarTileCoord
{
}
- ScalarTileCoord(int32_t x0, int32_t y0)
- : x(x0), y(y0)
+ ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0)
{
}
- int32_t x{ -1 };
- int32_t y{ -1 };
+ int32_t x{-1};
+ int32_t y{-1};
};
/**
@@ -1768,7 +1764,7 @@ public:
private:
std::string _str{};
- OperandType _type{ OperandType::Unknown };
+ OperandType _type{OperandType::Unknown};
ScalarTileCoord _coord{};
};
@@ -1778,16 +1774,15 @@ struct GpuSampler
{
GpuSampler() = default;
- TensorSamplerFormat format{ TensorSamplerFormat::Unknown };
- GpuSamplerTensorStorage storage{ GpuSamplerTensorStorage::Unknown };
- TensorSamplerAddressModeX address_mode_x{ TensorSamplerAddressModeX::Unknown };
- TensorSamplerAddressModeY address_mode_y{ TensorSamplerAddressModeY::Unknown };
- TensorSamplerAddressModeZ address_mode_z{ TensorSamplerAddressModeZ::Unknown };
+ TensorSamplerFormat format{TensorSamplerFormat::Unknown};
+ GpuSamplerTensorStorage storage{GpuSamplerTensorStorage::Unknown};
+ TensorSamplerAddressModeX address_mode_x{TensorSamplerAddressModeX::Unknown};
+ TensorSamplerAddressModeY address_mode_y{TensorSamplerAddressModeY::Unknown};
+ TensorSamplerAddressModeZ address_mode_z{TensorSamplerAddressModeZ::Unknown};
};
-inline GpuSampler
-create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y,
- int32_t step_z)
+inline GpuSampler create_simple_sampler(
+ const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
{
CKW_UNUSED(step_x, step_y, step_z);
@@ -1804,7 +1799,7 @@ create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int3
int32_t dim_y = 0;
int32_t dim_z = 0;
- switch(sampler.format)
+ switch (sampler.format)
{
case TensorSamplerFormat::C_W_H:
dim_x = tensor[0];
@@ -1822,19 +1817,19 @@ create_simple_sampler(const TensorInfo *tensor_info_id, GpuSampler sampler, int3
break;
}
- if(dim_x == 1)
+ if (dim_x == 1)
{
assert(step_x == 1);
dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
}
- if(dim_y == 1)
+ if (dim_y == 1)
{
assert(step_y == 1);
dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
}
- if(dim_z == 1)
+ if (dim_z == 1)
{
assert(step_z == 1);
dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
@@ -1858,8 +1853,12 @@ public:
 * @param[in] step_y Increment step in the Y direction. It is not necessarily the same as the tile's m0!
 * @param[in] step_z Increment step in the Z direction. It is not necessarily the same as the tile's d0!
*/
- void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage,
- TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z)
+ void initialize(const TensorInfo *tensor_info_id,
+ GpuSamplerTensorStorage tensor_storage,
+ TensorSamplerFormat tensor_format,
+ int32_t step_x,
+ int32_t step_y,
+ int32_t step_z)
{
assert(_is_initialized == false);
@@ -1908,13 +1907,13 @@ private:
sampler.address_mode_z = TensorSamplerAddressModeZ::None;
// In the case of texture, we do not need any special checks at the border
- if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
+ if (tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
{
int32_t dim_x = 0;
int32_t dim_y = 0;
int32_t dim_z = 0;
- switch(tensor_format)
+ switch (tensor_format)
{
case TensorSamplerFormat::C_W_H:
dim_x = tensor[0];
@@ -1932,17 +1931,17 @@ private:
break;
}
- if((dim_x % _step_x) != 0 && dim_x != 1)
+ if ((dim_x % _step_x) != 0 && dim_x != 1)
{
sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
}
- if((dim_y % _step_y) != 0 && dim_y != 1)
+ if ((dim_y % _step_y) != 0 && dim_y != 1)
{
sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
}
- if((dim_z % _step_z) != 0 && dim_z != 1)
+ if ((dim_z % _step_z) != 0 && dim_z != 1)
{
sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
}
@@ -1952,11 +1951,11 @@ private:
}
GpuSampler _sampler{}; // GpuSampler
- int32_t _step_x{ 1 };
- int32_t _step_y{ 1 };
- int32_t _step_z{ 1 };
- const TensorInfo *_tensor_info_id{ nullptr };
- bool _is_initialized{ false };
+ int32_t _step_x{1};
+ int32_t _step_y{1};
+ int32_t _step_z{1};
+ const TensorInfo *_tensor_info_id{nullptr};
+ bool _is_initialized{false};
};
/**
@@ -1965,8 +1964,7 @@ private:
class TensorOperand
{
public:
- TensorOperand(const std::string &val, GpuSampler sampler)
- : _str(val), _sampler(sampler)
+ TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
{
}
@@ -2050,9 +2048,9 @@ private:
struct LWS
{
- int32_t x{ 1 };
- int32_t y{ 1 };
- int32_t z{ 1 };
+ int32_t x{1};
+ int32_t y{1};
+ int32_t z{1};
};
/**
@@ -2062,8 +2060,7 @@ struct LWS
class OperandUnpacker
{
public:
- OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments)
- : _tiles(tiles), _arguments(arguments)
+ OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) : _tiles(tiles), _arguments(arguments)
{
// Increase the level of the stack to allocate possible temporary tiles
_tiles.increment_registry_level();
@@ -2078,26 +2075,26 @@ public:
IVectorTile *unpack(const Operand &src)
{
// Get the tile
- if(src.type() == OperandType::Tile)
+ if (src.type() == OperandType::Tile)
{
assert(_tiles.has_tile(src.value()));
return _tiles[src.value()];
}
// Create an anonymous tile with a constant
- else if(static_cast<int32_t>(src.type()) & 0x00001000)
+ else if (static_cast<int32_t>(src.type()) & 0x00001000)
{
- if(src.type() == OperandType::ScalarTile)
+ if (src.type() == OperandType::ScalarTile)
{
ScalarTileCoord coord = src.scalar_tile_coordinate();
assert(_tiles.has_tile(src.value()));
assert(coord.x >= 0);
assert(coord.y >= 0);
auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
- return _tiles.insert({ { { val.str } } }, val.type.dt);
+ return _tiles.insert({{{val.str}}}, val.type.dt);
}
else
{
- return _tiles.insert({ { { src.value() } } }, to_tile_data_type(src.type()));
+ return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
}
}
// Create an anonymous tile with the tensor component
@@ -2107,7 +2104,7 @@ public:
auto x = _arguments[src.value()];
const std::string val = x->component(to_tensor_component(src.type()));
const DataType dt = x->component_data_type();
- return _tiles.insert({ { { val } } }, dt);
+ return _tiles.insert({{{val}}}, dt);
}
}
@@ -2119,7 +2116,7 @@ private:
TensorComponentType to_tensor_component(OperandType x)
{
- switch(x)
+ switch (x)
{
case OperandType::TensorDim0:
return TensorComponentType::Dim0;
@@ -2163,8 +2160,7 @@ private:
class TensorOperandUnpacker
{
public:
- TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments)
- : _arguments(arguments){};
+ TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) : _arguments(arguments){};
IGpuTensorArgument *unpack(const TensorOperand &src)
{
@@ -2191,9 +2187,11 @@ struct GpuKernel
std::string config_id{}; // Unique id, required for the tuning stage
std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
// Dispatch stage
- GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
- std::vector<std::pair<int32_t, GpuTensorStorage>> list_tensor_storages; // List of tensor storages, required for the dispatch stage
- std::vector<std::pair<int32_t, TensorComponentType>> list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage)
+ GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
+ std::vector<std::pair<int32_t, GpuTensorStorage>>
+ list_tensor_storages; // List of tensor storages, required for the dispatch stage
+ std::vector<std::pair<int32_t, TensorComponentType>>
+        list_tensor_components; // List of tensor components (width, stride, ...), required for the dispatch stage
};
// Generate all extension pragmas (hardcoded for now)
@@ -2234,13 +2232,13 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin
auto tensor_args = in.arguments.tensor_argument_declarations();
- for(auto &i : tensor_args)
+ for (auto &i : tensor_args)
{
// For each tensor used, get the storage and tensor components
auto storages = i->storage_declarations();
auto components = i->component_declarations();
- for(auto &y : storages)
+ for (auto &y : storages)
{
std::string str;
str += i->storage_type_declaration(y);
@@ -2249,7 +2247,7 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin
arg_str.push_back(str);
}
- for(auto &y : components)
+ for (auto &y : components)
{
std::string str;
str += i->component_type_declaration();
@@ -2259,10 +2257,10 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin
}
}
- for(size_t i = 0; i < arg_str.size(); ++i)
+ for (size_t i = 0; i < arg_str.size(); ++i)
{
code += arg_str[i];
- if(i + 1 < arg_str.size())
+ if (i + 1 < arg_str.size())
{
code += ",\n";
}
@@ -2284,13 +2282,12 @@ inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::strin
class GpuTensor3dMapper
{
public:
- GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler)
- : _sampler(sampler), _tensor(tensor){};
+ GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor){};
std::string tensor_component_x() const
{
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
case TensorSamplerFormat::C_W_H:
@@ -2305,7 +2302,7 @@ public:
std::string tensor_component_y() const
{
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
return _tensor->component(TensorComponentType::Dim1xDim2);
@@ -2321,7 +2318,7 @@ public:
std::string tensor_component_z() const
{
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
return "1";
@@ -2337,7 +2334,7 @@ public:
std::string tensor_component_stride_y() const
{
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
case TensorSamplerFormat::C_W_H:
@@ -2352,7 +2349,7 @@ public:
std::string tensor_component_stride_z() const
{
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
return "0";
@@ -2368,7 +2365,7 @@ public:
std::string tensor_component_stride_batch() const
{
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
case TensorSamplerFormat::C_W_H:
@@ -2384,7 +2381,7 @@ public:
{
auto t = _tensor->format();
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
case TensorSamplerFormat::C_W_H:
@@ -2400,7 +2397,7 @@ public:
{
auto t = _tensor->format();
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
return (t.shape[1] * t.shape[2]) == 1;
@@ -2417,7 +2414,7 @@ public:
{
auto t = _tensor->format();
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
return true;
@@ -2434,7 +2431,7 @@ public:
{
auto t = _tensor->format();
const auto format = _sampler.format;
- switch(format)
+ switch (format)
{
case TensorSamplerFormat::C_WH_1:
case TensorSamplerFormat::C_W_H:
@@ -2463,7 +2460,7 @@ private:
struct GpuKernelWriterAttribute
{
- bool return_tensor_component_by_value{ false };
+ bool return_tensor_component_by_value{false};
};
enum class RoundingMode
@@ -2489,7 +2486,8 @@ public:
virtual void declare_tile(const std::string &name, const TileInfo &info) = 0;
- virtual void declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) = 0;
+ virtual void
+ declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) = 0;
virtual void write_text(const std::string &x) = 0;
@@ -2498,48 +2496,82 @@ public:
virtual void compound_statement_end() = 0;
// Operations
- virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0;
+ virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0;
- virtual void op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0;
+ virtual void
+ op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0;
- virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0;
+ virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0;
- virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0;
+ virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0;
- virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0;
+ virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0;
- virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
+ virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
- virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0;
+ virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0;
- virtual void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0;
+ virtual void
+ op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0;
- virtual void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) = 0;
+ virtual void op_binary_elementwise_function(const Operand &dst_name,
+ BinaryFunction func,
+ const Operand &first_name,
+ const Operand &second_name) = 0;
- virtual void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) = 0;
+ virtual void op_ternary_elementwise_function(const Operand &dst_name,
+ TernaryFunction func,
+ const Operand &first_name,
+ const Operand &second_name,
+ const Operand &third_name) = 0;
- virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
+ virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
- virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
+ virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
- virtual void op_else_header() = 0;
+ virtual void op_else_header() = 0;
- virtual void op_for_loop_header(const Operand &var_name, BinaryOp cond_op, const Operand &cond_value, const Operand &update_var, AssignmentOp update_op, const Operand &update_value) = 0;
+ virtual void op_for_loop_header(const Operand &var_name,
+ BinaryOp cond_op,
+ const Operand &cond_value,
+ const Operand &update_var,
+ AssignmentOp update_op,
+ const Operand &update_value) = 0;
- virtual void op_load_indirect(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y_indirect, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
+ virtual void op_load_indirect(const TensorOperand &tensor,
+ const Operand &dst,
+ const Operand &x,
+ const Operand &y_indirect,
+ const Operand &z,
+ const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
- virtual void op_load_immediate(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32), const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
+ virtual void op_load_immediate(const TensorOperand &tensor,
+ const Operand &dst,
+ const Operand &x,
+ const Operand &y,
+ const Operand &z,
+ const Operand &b = Operand("0", OperandType::ScalarInt32),
+ const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
- virtual void op_store_immediate(const TensorOperand &tensor, const Operand &src, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
+ virtual void op_store_immediate(const TensorOperand &tensor,
+ const Operand &src,
+ const Operand &x,
+ const Operand &y,
+ const Operand &z,
+ const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
- virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0;
+ virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0;
- virtual void op_return() = 0;
+ virtual void op_return() = 0;
// Utils
// It is the process of converting
- virtual void util_get_indirect_buffer(const Operand &dst, const TensorOperand &tensor, const Operand &x,
- const Operand &y, const Operand &x_off, const Operand &y_off) = 0;
+ virtual void util_get_indirect_buffer(const Operand &dst,
+ const TensorOperand &tensor,
+ const Operand &x,
+ const Operand &y,
+ const Operand &x_off,
+ const Operand &y_off) = 0;
};
enum class GpuLoadStoreType
@@ -2586,12 +2618,11 @@ public:
ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
- static bool
- validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
+ static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
{
CKW_UNUSED(x, type, dst);
- if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
+ if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
{
return false;
}
@@ -2675,10 +2706,10 @@ public:
out_of_bound_finalize_y(dst);
// The left over load/store will be written in the finalize stage
- if(_ls_width_part.size() != 0)
+ if (_ls_width_part.size() != 0)
{
int32_t w = 0;
- for(auto &p : _ls_width_part)
+ for (auto &p : _ls_width_part)
{
const std::string dst0 = _dst->vector(w, p, idx_y).str;
const std::string coord_x = _coord_x + " + " + std::to_string(w);
@@ -2698,8 +2729,8 @@ public:
}
private:
- IVectorTile *_dst{ nullptr };
- int32_t _ls_width_full{ 0 };
+ IVectorTile *_dst{nullptr};
+ int32_t _ls_width_full{0};
std::vector<int32_t> _ls_width_part{};
std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x{};
std::string _coord_x{};
@@ -2709,13 +2740,13 @@ private:
void out_of_bound_initialize_x(std::string &coord)
{
- if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
+ if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
{
auto tensor_format = _mapper.tensor_argument()->format();
auto shape = tensor_format.shape;
_ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
- if(_ls_width_part.size() != 0)
+ if (_ls_width_part.size() != 0)
{
_writer->write_text("if(" + coord + " > 0)\n");
_writer->compound_statement_begin();
@@ -2725,16 +2756,16 @@ private:
void out_of_bound_finalize_x()
{
- if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
+ if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
{
- if(_ls_width_part.size() != 0)
+ if (_ls_width_part.size() != 0)
{
_writer->compound_statement_end();
_writer->write_text("else\n");
_writer->compound_statement_begin();
out_of_bound_initialize_z(_coord_orig_z);
- for(auto &i : _leftovers_x)
+ for (auto &i : _leftovers_x)
{
out_of_bound_initialize_y(i.first.second);
_writer->write_text(i.second);
@@ -2753,7 +2784,7 @@ private:
const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::Skip:
case TensorSamplerAddressModeY::ClampToBorder:
@@ -2799,7 +2830,7 @@ private:
{
const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::ClampToBorder:
case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
@@ -2816,7 +2847,7 @@ private:
assert(false);
}
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::ClampToBorder:
case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
@@ -2841,7 +2872,7 @@ private:
const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
- switch(address_mode_z)
+ switch (address_mode_z)
{
case TensorSamplerAddressModeZ::Skip:
max = _mapper.tensor_component_z();
@@ -2880,7 +2911,7 @@ private:
{
const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
- switch(address_mode_z)
+ switch (address_mode_z)
{
case TensorSamplerAddressModeZ::Skip:
case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
@@ -2899,7 +2930,7 @@ private:
{
std::vector<int32_t> x;
- switch(ls_leftover_vector_width)
+ switch (ls_leftover_vector_width)
{
case 0:
break;
@@ -2961,13 +2992,13 @@ private:
return x;
}
- std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data,
- const std::string &address)
+ std::string
+ to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &address)
{
- switch(type)
+ switch (type)
{
case GpuLoadStoreType::Load:
- if(vector_width != 1)
+ if (vector_width != 1)
{
return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
}
@@ -2977,7 +3008,7 @@ private:
}
break;
case GpuLoadStoreType::Store:
- if(vector_width != 1)
+ if (vector_width != 1)
{
return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
}
@@ -2993,25 +3024,25 @@ private:
}
}
- std::string to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z,
- const std::string &b) const
+ std::string
+ to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
{
- auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
+ auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
- const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
- const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
+ const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
+ const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
std::string address;
address += "(__global ";
address += dst_type;
address += "*)(";
address += ptr_buf;
- if(x != "0" && (_mapper.is_one_component_x() != true))
+ if (x != "0" && (_mapper.is_one_component_x() != true))
{
address += " + (";
address += x + ") * sizeof(" + dst_type + ")";
}
- if(y != "0")
+ if (y != "0")
{
const std::string stride_y = _mapper.tensor_component_stride_y();
address += " + (";
@@ -3019,7 +3050,7 @@ private:
address += " * ";
address += stride_y;
}
- if(z != "0" && (_mapper.is_one_component_z() != true))
+ if (z != "0" && (_mapper.is_one_component_z() != true))
{
const std::string stride_z = _mapper.tensor_component_stride_z();
address += " + (";
@@ -3027,7 +3058,7 @@ private:
address += " * ";
address += stride_z;
}
- if(b != "0" && (_mapper.is_one_component_batch() != true))
+ if (b != "0" && (_mapper.is_one_component_batch() != true))
{
const std::string stride_b = _mapper.tensor_component_stride_batch();
address += " + (";
@@ -3043,32 +3074,32 @@ private:
class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
{
public:
- static bool
- validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst)
+ static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst)
{
CKW_UNUSED(x);
- if(dst->format().w != 4)
+ if (dst->format().w != 4)
{
return false;
}
- if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
+ if (mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
{
return false;
}
- if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
+ if (mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
{
return false;
}
- if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
+ if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
{
return false;
}
- if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store)
+ if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly &&
+ type == GpuLoadStoreType::Store)
{
return false;
}
- if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
+ if ((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
{
return false;
}
@@ -3134,8 +3165,8 @@ public:
}
private:
- IVectorTile *_dst{ nullptr };
- int32_t _ls_width_full{ 0 };
+ IVectorTile *_dst{nullptr};
+ int32_t _ls_width_full{0};
std::string _coord_x{};
std::string _coord_z{};
std::string _coord_b{};
@@ -3146,7 +3177,7 @@ private:
const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::Skip:
max = _mapper.tensor_component_y();
@@ -3182,7 +3213,7 @@ private:
const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::Skip:
case TensorSamplerAddressModeY::SkipMinEdgeOnly:
@@ -3195,16 +3226,19 @@ private:
}
};
- std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string &data,
- const std::string &sampler, const std::string &coord)
+ std::string to_ls_image2d(GpuLoadStoreType type,
+ int32_t vector_width,
+ const std::string &data,
+ const std::string &sampler,
+ const std::string &coord)
{
CKW_UNUSED(vector_width);
auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
- const std::string post_fix = _dst->format().dt == DataType::Fp32 ? "f" : "h";
+ const std::string post_fix = _dst->format().dt == DataType::Fp32 ? "f" : "h";
- switch(type)
+ switch (type)
{
case GpuLoadStoreType::Load:
return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
@@ -3223,7 +3257,7 @@ private:
{
const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::None:
return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
@@ -3245,17 +3279,17 @@ private:
}
}
- std::string to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z,
- const std::string &b) const
+ std::string
+ to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
{
std::string coord_x = "(" + x + ") >> 2";
std::string coord_y = "(";
- if(y != "0")
+ if (y != "0")
{
coord_y += y;
}
- if(z != "0" && (_mapper.is_one_component_z() != true))
+ if (z != "0" && (_mapper.is_one_component_z() != true))
{
const std::string dim = _mapper.tensor_component_y();
coord_y += " + (";
@@ -3263,7 +3297,7 @@ private:
coord_y += " * ";
coord_y += dim;
}
- if(b != "0" && (_mapper.is_one_component_batch() != true))
+ if (b != "0" && (_mapper.is_one_component_batch() != true))
{
const std::string dim0 = _mapper.tensor_component_y();
const std::string dim1 = _mapper.tensor_component_z();
@@ -3292,7 +3326,7 @@ public:
create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
{
const auto tensor_storage = mapper.gpu_sampler().storage;
- switch(tensor_storage)
+ switch (tensor_storage)
{
case GpuSamplerTensorStorage::BufferUint8Ptr:
return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
@@ -3352,14 +3386,14 @@ public:
IVectorTile *x = _data->tiles[name];
- for(auto &t : x->underlying_source_variables())
+ for (auto &t : x->underlying_source_variables())
{
_data->code += t.type.str + " " + t.str + ";\n";
}
}
- void declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in,
- DataType dt) override
+ void
+ declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) override
{
assert(_data->tiles[name] == nullptr);
_data->tiles.insert(name, in, dt);
@@ -3387,7 +3421,8 @@ public:
{
assert(dst_var.type() == OperandType::Tile);
assert(_data->tiles.has_tile(dst_var.value()));
- assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
+ assert(_data->tiles[dst_var.value()]->format().w == 1 &&
+ _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
auto var = _data->tiles[dst_var.value()];
@@ -3397,8 +3432,10 @@ public:
_data->code += ");\n";
};
- void op_get_global_coord(const Operand &o_dst, const Operand &o_step, const TensorOperand &o_tensor,
- int32_t dim) override
+ void op_get_global_coord(const Operand &o_dst,
+ const Operand &o_step,
+ const TensorOperand &o_tensor,
+ int32_t dim) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
auto dst = operands.unpack(o_dst);
@@ -3412,17 +3449,17 @@ public:
GpuTensor3dMapper mapper(tensor, gpu_sampler);
- switch(dim)
+ switch (dim)
{
case 0:
- if(mapper.is_one_component_x())
+ if (mapper.is_one_component_x())
{
_data->code += dst->scalar(0, 0).str;
_data->code += " = 0;\n";
}
else
{
- if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
+ if (mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
{
// Validation: Check: fixed tensor shape
// TO BE CHANGED
@@ -3441,14 +3478,14 @@ public:
}
break;
case 1:
- if(mapper.is_one_component_y())
+ if (mapper.is_one_component_y())
{
_data->code += dst->scalar(0, 0).str;
_data->code += " = 0;\n";
}
else
{
- if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
+ if (mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
{
}
else
@@ -3461,7 +3498,7 @@ public:
}
break;
case 2:
- if(mapper.is_one_component_z())
+ if (mapper.is_one_component_z())
{
_data->code += dst->scalar(0, 0).str;
_data->code += " = 0;\n";
@@ -3490,7 +3527,7 @@ public:
GpuTensor3dMapper mapper(tensor, gpu_sampler);
- if(mapper.is_one_component_batch())
+ if (mapper.is_one_component_batch())
{
_data->code += dst->scalar(0, 0).str;
_data->code += " = 0;\n";
@@ -3506,7 +3543,8 @@ public:
{
assert(dst_var.type() == OperandType::Tile);
assert(_data->tiles.has_tile(dst_var.value()));
- assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
+ assert(_data->tiles[dst_var.value()]->format().w == 1 &&
+ _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
auto var = _data->tiles[dst_var.value()];
@@ -3532,7 +3570,7 @@ public:
const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = ";
@@ -3542,7 +3580,9 @@ public:
}
}
- void op_binary_expression(const Operand &dst_name, const Operand &lhs_name, BinaryOp op,
+ void op_binary_expression(const Operand &dst_name,
+ const Operand &lhs_name,
+ BinaryOp op,
const Operand &rhs_name) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
@@ -3556,14 +3596,14 @@ public:
const int32_t lhs_w = lhs->format().w;
const int32_t rhs_w = rhs->format().w;
- if(op == BinaryOp::MatMul_Nt_T)
+ if (op == BinaryOp::MatMul_Nt_T)
{
assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
- for(int32_t x = 0; x < dst_w; ++x)
+ for (int32_t x = 0; x < dst_w; ++x)
{
- for(int32_t k = 0; k < lhs_w; ++k)
+ for (int32_t k = 0; k < lhs_w; ++k)
{
_data->code += dst->scalar(x, y).str;
_data->code += " = fma(";
@@ -3583,12 +3623,14 @@ public:
const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
- const std::string lhs_prefix = broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
- const std::string rhs_prefix = broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
- const std::string op_str = to_string(op);
+ const std::string lhs_prefix =
+ broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
+ const std::string rhs_prefix =
+ broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
+ const std::string op_str = to_string(op);
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = ";
@@ -3607,13 +3649,13 @@ public:
const IVectorTile *src = operands.unpack(o_src);
const IVectorTile *dst = operands.unpack(o_dst);
// const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
- const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16);
- const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : "");
+ const int32_t dst_h = dst->format().h;
+ const std::string dt = dst->underlying_source_variables()[0].type.str;
+ const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16);
+ const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : "");
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = convert_" + dt + sat + "(";
@@ -3638,7 +3680,7 @@ public:
const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = ";
@@ -3647,8 +3689,7 @@ public:
}
}
- void
- op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override
+ void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
const IVectorTile *src = operands.unpack(src_name);
@@ -3665,12 +3706,12 @@ public:
const std::string src_prefix = "(" + dt + ")";
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = ";
- switch(func)
+ switch (func)
{
case UnaryFunction::Exp:
_data->code += "exp(";
@@ -3708,7 +3749,10 @@ public:
}
}
- void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) override
+ void op_binary_elementwise_function(const Operand &dst_name,
+ BinaryFunction func,
+ const Operand &first_name,
+ const Operand &second_name) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
const IVectorTile *first = operands.unpack(first_name);
@@ -3726,12 +3770,12 @@ public:
const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16);
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = ";
- switch(func)
+ switch (func)
{
case BinaryFunction::Min:
_data->code += is_float ? "fmin(" : "min(";
@@ -3750,7 +3794,11 @@ public:
}
}
- void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) override
+ void op_ternary_elementwise_function(const Operand &dst_name,
+ TernaryFunction func,
+ const Operand &first_name,
+ const Operand &second_name,
+ const Operand &third_name) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
const IVectorTile *first = operands.unpack(first_name);
@@ -3758,8 +3806,8 @@ public:
const IVectorTile *third = operands.unpack(third_name);
const IVectorTile *dst = operands.unpack(dst_name);
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
+ const int32_t dst_h = dst->format().h;
+ const std::string dt = dst->underlying_source_variables()[0].type.str;
// Always perform an explicit cast. See similar comments in op_unary_elementwise_function
const std::string first_prefix = "(" + dt + ")";
@@ -3767,12 +3815,12 @@ public:
const std::string third_prefix = "(" + dt + ")";
// Broadcasting on Y is automatic
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
_data->code += dst->vector(y).str;
_data->code += " = ";
- switch(func)
+ switch (func)
{
case TernaryFunction::Select:
_data->code += "select(";
@@ -3822,7 +3870,12 @@ public:
_data->code += "else\n";
}
- void op_for_loop_header(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, const Operand &update_var_name, AssignmentOp update_op, const Operand& update_value_name) override
+ void op_for_loop_header(const Operand &var_name,
+ BinaryOp cond_op,
+ const Operand &cond_value_name,
+ const Operand &update_var_name,
+ AssignmentOp update_op,
+ const Operand &update_value_name) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
const IVectorTile *var = operands.unpack(var_name);
@@ -3850,9 +3903,13 @@ public:
_data->code += "\n";
}
- void op_load_immediate(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x,
- const Operand &o_y, const Operand &o_z, const Operand &o_batch_idx,
- const Operand &dilation_y) override
+ void op_load_immediate(const TensorOperand &o_tensor,
+ const Operand &o_dst,
+ const Operand &o_x,
+ const Operand &o_y,
+ const Operand &o_z,
+ const Operand &o_batch_idx,
+ const Operand &dilation_y) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
@@ -3875,10 +3932,10 @@ public:
// Initialize the constant part
load_writer->initialize(dst, x, z, b);
- for(int i = 0; i < dst->format().h; ++i)
+ for (int i = 0; i < dst->format().h; ++i)
{
std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
- if(dil_y->scalar(0, 0).str != "1")
+ if (dil_y->scalar(0, 0).str != "1")
{
coord_y += " * " + dil_y->scalar(0, 0).str;
}
@@ -3888,9 +3945,12 @@ public:
load_writer->finalize();
}
- void op_load_indirect(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x,
- const Operand &o_indirect_h, const Operand &o_z,
- const Operand &o_batch_idx) override
+ void op_load_indirect(const TensorOperand &o_tensor,
+ const Operand &o_dst,
+ const Operand &o_x,
+ const Operand &o_indirect_h,
+ const Operand &o_z,
+ const Operand &o_batch_idx) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
@@ -3912,7 +3972,7 @@ public:
// Initialize the constant part
load_writer->initialize(dst, x, z, b);
- for(int i = 0; i < dst->format().h; ++i)
+ for (int i = 0; i < dst->format().h; ++i)
{
load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
}
@@ -3920,9 +3980,12 @@ public:
load_writer->finalize();
}
- void op_store_immediate(const TensorOperand &tensor_name, const Operand &src_name, const Operand &x_name,
- const Operand &y_name, const Operand &z_name,
- const Operand &batch_index_name) override
+ void op_store_immediate(const TensorOperand &tensor_name,
+ const Operand &src_name,
+ const Operand &x_name,
+ const Operand &y_name,
+ const Operand &z_name,
+ const Operand &batch_index_name) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
@@ -3946,7 +4009,7 @@ public:
int32_t tile_h = src->format().h;
- for(int m0 = tile_h - 1; m0 >= 0; m0--)
+ for (int m0 = tile_h - 1; m0 >= 0; m0--)
{
store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
}
@@ -3959,8 +4022,12 @@ public:
_data->code += "return;\n";
}
- void util_get_indirect_buffer(const Operand &o_dst, const TensorOperand &o_tensor, const Operand &o_x,
- const Operand &o_y, const Operand &o_x_off, const Operand &o_y_off) override
+ void util_get_indirect_buffer(const Operand &o_dst,
+ const TensorOperand &o_tensor,
+ const Operand &o_x,
+ const Operand &o_y,
+ const Operand &o_x_off,
+ const Operand &o_y_off) override
{
OperandUnpacker operands(_data->tiles, _data->arguments);
const IVectorTile *dst = operands.unpack(o_dst);
@@ -4002,7 +4069,7 @@ public:
declare_tile("_y_s", TileInfo(DataType::Int32));
auto x_s = operands.unpack(Operand("_x_s"));
auto y_s = operands.unpack(Operand("_y_s"));
- for(int i = 0; i < dst->format().h; ++i)
+ for (int i = 0; i < dst->format().h; ++i)
{
// x_s = (xi_0 + x_k);
// y_s = (yi_0 + y_k);
@@ -4060,8 +4127,8 @@ public:
}
private:
- GpuKernelWriterDataHolder *_data{ nullptr };
- GpuKernelWriterAttribute *_attr{ nullptr };
+ GpuKernelWriterDataHolder *_data{nullptr};
+ GpuKernelWriterAttribute *_attr{nullptr};
};
/** IGpuKernelWriter factory class */
@@ -4074,10 +4141,9 @@ public:
*
* @return IGpuKernelWriter
*/
- static std::unique_ptr<IGpuKernelWriter>
- create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
+ static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
{
- switch(x->programming_language())
+ switch (x->programming_language())
{
case GpuTargetLanguage::OpenCL:
return std::make_unique<ClKernelWriter>(attr, x);
@@ -4094,9 +4160,9 @@ adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *t
{
auto tensor = tensor_info_id->shape;
- int32_t dim[3] = { 0 };
+ int32_t dim[3] = {0};
- switch(tensor_format)
+ switch (tensor_format)
{
case TensorSamplerFormat::C_W_H:
dim[0] = tensor[0];
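Aside, not part of the commit: the image2d load/store helper reformatted above emits OpenCL read_image*/write_image* statements whose suffix follows the tile data type (the "f"/"h" post_fix), and its validate() only admits 4-wide fp32/fp16 tiles. A minimal standalone sketch of that string construction, with hypothetical argument names, is shown below; it mirrors the Load branch of to_ls_image2d() and is an illustration, not library code.

    #include <iostream>
    #include <string>

    // Illustration only: an Fp32 tile uses read_imagef, an Fp16 tile uses read_imageh.
    std::string emit_image2d_load(const std::string &dst, const std::string &image,
                                  const std::string &sampler, const std::string &coord, bool is_fp16)
    {
        const std::string post_fix = is_fp16 ? "h" : "f";
        return dst + " = read_image" + post_fix + "(" + image + ", " + sampler + ", " + coord + ")";
    }

    int main()
    {
        // Sampler string taken from the None address mode case above; other names are hypothetical.
        std::cout << emit_image2d_load("dst0", "g_src_img2d",
                                       "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST",
                                       "(int2)(xi, yi)", /* is_fp16 */ false)
                  << ";\n";
        return 0;
    }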
diff --git a/compute_kernel_writer/prototype/src/TensorOperand.cpp b/compute_kernel_writer/prototype/src/TensorOperand.cpp
index c6725d3b26..d1aefbbb71 100644
--- a/compute_kernel_writer/prototype/src/TensorOperand.cpp
+++ b/compute_kernel_writer/prototype/src/TensorOperand.cpp
@@ -23,10 +23,12 @@
*/
#include "ckw/TensorOperand.h"
+
#include "ckw/Error.h"
#include "ckw/Kernel.h"
#include "ckw/TensorInfo.h"
#include "ckw/TileOperand.h"
+
#include "src/Prototype.h"
namespace ckw
@@ -35,9 +37,11 @@ namespace ckw
namespace
{
-TensorComponentOperand &get_or_create_component(TensorOperand &tensor, std::unique_ptr<TensorComponentOperand> &ptr, TensorComponentType component)
+TensorComponentOperand &get_or_create_component(TensorOperand &tensor,
+ std::unique_ptr<TensorComponentOperand> &ptr,
+ TensorComponentType component)
{
- if(ptr == nullptr)
+ if (ptr == nullptr)
{
ptr = std::make_unique<TensorComponentOperand>(tensor, component);
}
@@ -59,7 +63,7 @@ TensorOperand::TensorOperand(const std::string &name, const TensorInfo &info, Te
prototype::Operand TensorOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
{
CKW_UNUSED(writer);
- return { name() };
+ return {name()};
}
const TensorInfo &TensorOperand::info() const
@@ -206,9 +210,9 @@ TensorComponentType TensorComponentOperand::component_type() const
prototype::Operand TensorComponentOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
{
CKW_UNUSED(writer);
- prototype::OperandType type{ prototype::OperandType::Unknown };
+ prototype::OperandType type{prototype::OperandType::Unknown};
- switch(_component)
+ switch (_component)
{
case TensorComponentType::OffsetFirstElement:
type = prototype::OperandType::TensorDataOffset;
diff --git a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
index 28e54df3a5..bf9f946ce8 100644
--- a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
+++ b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
@@ -23,6 +23,7 @@
*/
#include "ckw/TensorTileSampler.h"
+
#include "ckw/TileOperand.h"
#include "ckw/types/TensorSamplerTypes.h"
@@ -33,24 +34,47 @@ TensorTileSampler::TensorTileSampler()
{
}
-TensorTileSampler::TensorTileSampler(
- TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z)
- : _x(&x), _y(&y), _z(&z), _b(&b), _height(0), _width(0), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z)
-{
-}
-
-TensorTileSampler::TensorTileSampler(
- TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b,
- int32_t height, int32_t width,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z)
- : _x(&x), _y(&y), _z(&z), _b(&b), _height(height), _width(width), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z)
+TensorTileSampler::TensorTileSampler(TileOperand &x,
+ TileOperand &y,
+ TileOperand &z,
+ TileOperand &b,
+ TensorSamplerFormat format,
+ TensorSamplerAddressModeX address_mode_x,
+ TensorSamplerAddressModeY address_mode_y,
+ TensorSamplerAddressModeZ address_mode_z)
+ : _x(&x),
+ _y(&y),
+ _z(&z),
+ _b(&b),
+ _height(0),
+ _width(0),
+ _format(format),
+ _address_mode_x(address_mode_x),
+ _address_mode_y(address_mode_y),
+ _address_mode_z(address_mode_z)
+{
+}
+
+TensorTileSampler::TensorTileSampler(TileOperand &x,
+ TileOperand &y,
+ TileOperand &z,
+ TileOperand &b,
+ int32_t height,
+ int32_t width,
+ TensorSamplerFormat format,
+ TensorSamplerAddressModeX address_mode_x,
+ TensorSamplerAddressModeY address_mode_y,
+ TensorSamplerAddressModeZ address_mode_z)
+ : _x(&x),
+ _y(&y),
+ _z(&z),
+ _b(&b),
+ _height(height),
+ _width(width),
+ _format(format),
+ _address_mode_x(address_mode_x),
+ _address_mode_y(address_mode_y),
+ _address_mode_z(address_mode_z)
{
}
diff --git a/compute_kernel_writer/prototype/src/TileInfo.cpp b/compute_kernel_writer/prototype/src/TileInfo.cpp
index 66d8cb1620..273266eedc 100644
--- a/compute_kernel_writer/prototype/src/TileInfo.cpp
+++ b/compute_kernel_writer/prototype/src/TileInfo.cpp
@@ -26,18 +26,15 @@
namespace ckw
{
-TileInfo::TileInfo(DataType dt)
- : _dt(dt), _shape({ { 1, 1 } })
+TileInfo::TileInfo(DataType dt) : _dt(dt), _shape({{1, 1}})
{
}
-TileInfo::TileInfo(DataType dt, int32_t w)
- : _dt(dt), _shape({ { w, 1 } })
+TileInfo::TileInfo(DataType dt, int32_t w) : _dt(dt), _shape({{w, 1}})
{
}
-TileInfo::TileInfo(DataType dt, int32_t h, int32_t w)
- : _dt(dt), _shape({ { w, h } })
+TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) : _dt(dt), _shape({{w, h}})
{
}
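Aside, not part of the commit: the (dt, h, w) constructor above stores the shape width-first (_shape({{w, h}})), so index 0 holds the width. A tiny standalone illustration of that ordering, assuming the public width()/height() accessors read those slots:

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Conceptually built from TileInfo(dt, /* h */ 2, /* w */ 4).
        std::array<int32_t, 2> shape{{4, 2}}; // width first, as in _shape({{w, h}})
        assert(shape[0] == 4);                // width
        assert(shape[1] == 2);                // height
        return 0;
    }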
diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp
index 0eb2ca6a64..e09c833d96 100644
--- a/compute_kernel_writer/prototype/src/TileOperand.cpp
+++ b/compute_kernel_writer/prototype/src/TileOperand.cpp
@@ -23,47 +23,43 @@
*/
#include "ckw/TileOperand.h"
+
#include "ckw/Error.h"
+
#include "src/Prototype.h"
namespace ckw
{
TileOperand::TileOperand(const std::string &name, const TileInfo &info)
- : OperandBase(name),
- _info(info),
- _value{ std::vector<std::string>{ "0" } },
- _constant(false)
+ : OperandBase(name), _info(info), _value{std::vector<std::string>{"0"}}, _constant(false)
{
}
TileOperand::TileOperand(const std::string &name, DataType data_type)
- : OperandBase(name),
- _info(TileInfo{ data_type }),
- _value{ std::vector<std::string>{ "0" } },
- _constant(false)
+ : OperandBase(name), _info(TileInfo{data_type}), _value{std::vector<std::string>{"0"}}, _constant(false)
{
}
TileOperand::TileOperand(const std::string &name, int32_t value)
: OperandBase(name),
- _info(TileInfo{ DataType::Int32 }),
- _value{ std::vector<std::string>{ std::to_string(value) } },
+ _info(TileInfo{DataType::Int32}),
+ _value{std::vector<std::string>{std::to_string(value)}},
_constant(true)
{
}
TileOperand::TileOperand(const std::string &name, float value)
: OperandBase(name),
- _info(TileInfo{ DataType::Fp32 }),
- _value{ std::vector<std::string>{ std::to_string(value) } },
+ _info(TileInfo{DataType::Fp32}),
+ _value{std::vector<std::string>{std::to_string(value)}},
_constant(true)
{
}
TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt)
: OperandBase(name),
- _info(TileInfo{ dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size()) }),
+ _info(TileInfo{dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size())}),
_value(vals),
_constant(true)
{
@@ -73,11 +69,11 @@ prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter
{
CKW_UNUSED(writer);
- if(_constant)
+ if (_constant)
{
- if(is_scalar())
+ if (is_scalar())
{
- switch(_info.data_type())
+ switch (_info.data_type())
{
case DataType::Int32:
return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32);
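Aside, not part of the commit: the constant-value constructors above carry the literal as text (via std::to_string) inside a 1x1 container, and create_impl_operand() later re-emits it as a scalar immediate. A standalone sketch of that storage, with illustrative names only:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        const int32_t value = 7;
        // Mirrors TileOperand(name, int32_t): the value lives in _value[0][0] as a string.
        std::vector<std::vector<std::string>> value_container{{std::to_string(value)}};
        std::cout << "constant tile holds \"" << value_container[0][0] << "\"\n";
        return 0;
    }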
diff --git a/compute_kernel_writer/src/Error.cpp b/compute_kernel_writer/src/Error.cpp
index c5dae2eb75..e1e4bffcec 100644
--- a/compute_kernel_writer/src/Error.cpp
+++ b/compute_kernel_writer/src/Error.cpp
@@ -28,8 +28,8 @@
namespace ckw
{
-std::string create_error_msg(const std::string &file, const std::string &func, const std::string &line,
- const std::string &msg)
+std::string
+create_error_msg(const std::string &file, const std::string &func, const std::string &line, const std::string &msg)
{
std::string err;
err += "[COMPUTE_KERNEL_WRITER][ERROR]:";
@@ -38,4 +38,4 @@ std::string create_error_msg(const std::string &file, const std::string &func, c
err += " " + msg;
return err;
}
-} // namespace ckw
\ No newline at end of file
+} // namespace ckw
diff --git a/compute_kernel_writer/src/Helpers.cpp b/compute_kernel_writer/src/Helpers.cpp
index 799f79a187..82d4c4e917 100644
--- a/compute_kernel_writer/src/Helpers.cpp
+++ b/compute_kernel_writer/src/Helpers.cpp
@@ -22,15 +22,15 @@
* SOFTWARE.
*/
-#include "ckw/Error.h"
-
#include "src/Helpers.h"
+#include "ckw/Error.h"
+
namespace ckw
{
std::string dec_to_hex_as_string(int32_t dec)
{
- switch(dec)
+ switch (dec)
{
case 0:
case 1:
diff --git a/compute_kernel_writer/src/ITensorArgument.h b/compute_kernel_writer/src/ITensorArgument.h
index 838bd40f85..ece45a4dc4 100644
--- a/compute_kernel_writer/src/ITensorArgument.h
+++ b/compute_kernel_writer/src/ITensorArgument.h
@@ -28,6 +28,7 @@
#include "ckw/TensorInfo.h"
#include "ckw/types/TensorComponentType.h"
#include "ckw/types/TensorStorageType.h"
+
#include "src/ITile.h"
#include <string>
@@ -41,8 +42,8 @@ class ITensorComponent;
/** Tensor storage variable */
struct TensorStorageVariable
{
- std::string val{ "" }; /** Tensor storage as a string */
- TensorStorageType type{ TensorStorageType::Unknown }; /** Tensor storage type */
+ std::string val{""}; /** Tensor storage as a string */
+ TensorStorageType type{TensorStorageType::Unknown}; /** Tensor storage type */
};
/** Tensor argument base class.
@@ -83,8 +84,8 @@ public:
}
protected:
- TensorInfo _info{}; // Tensor info
- std::string _basename{ "" }; // Tensor name
+ TensorInfo _info{}; // Tensor info
+ std::string _basename{""}; // Tensor name
};
/** Tensor component argument base class */
diff --git a/compute_kernel_writer/src/ITensorComponent.h b/compute_kernel_writer/src/ITensorComponent.h
index e2775b62b0..f9c9d8fd81 100644
--- a/compute_kernel_writer/src/ITensorComponent.h
+++ b/compute_kernel_writer/src/ITensorComponent.h
@@ -26,6 +26,7 @@
#define CKW_SRC_ITENSORCOMPONENT_H
#include "ckw/types/TensorComponentType.h"
+
#include "src/ITile.h"
namespace ckw
diff --git a/compute_kernel_writer/src/ITile.h b/compute_kernel_writer/src/ITile.h
index 73b7315fb5..8eaac5ac12 100644
--- a/compute_kernel_writer/src/ITile.h
+++ b/compute_kernel_writer/src/ITile.h
@@ -37,15 +37,15 @@ using TileContainer = std::vector<std::vector<std::string>>;
/** Tile descriptor which reports the underlying datatype and vector length */
struct TileVariableDescriptor
{
- DataType dt{ DataType::Unknown }; /** Data type */
- int32_t len{ 1 }; /** Number of elements in a single variable. For example, 1 for scalar */
+ DataType dt{DataType::Unknown}; /** Data type */
+ int32_t len{1}; /** Number of elements in a single variable. For example, 1 for scalar */
};
/** Tile variable */
struct TileVariable
{
- std::string str{ "" }; /** Tile variable as a string */
- TileVariableDescriptor desc{}; /** Tile value descriptor which reports the datatype and vector length */
+ std::string str{""}; /** Tile variable as a string */
+ TileVariableDescriptor desc{}; /** Tile value descriptor which reports the datatype and vector length */
};
/** Interface to provide support for scalar access for a Tile.
diff --git a/compute_kernel_writer/src/Kernel.cpp b/compute_kernel_writer/src/Kernel.cpp
index bfb0f46300..12389b3816 100644
--- a/compute_kernel_writer/src/Kernel.cpp
+++ b/compute_kernel_writer/src/Kernel.cpp
@@ -23,6 +23,7 @@
*/
#include "ckw/Kernel.h"
+
#include "ckw/types/TargetLanguage.h"
namespace ckw
diff --git a/compute_kernel_writer/src/KernelArgument.cpp b/compute_kernel_writer/src/KernelArgument.cpp
index a31ca1757b..a640d36507 100644
--- a/compute_kernel_writer/src/KernelArgument.cpp
+++ b/compute_kernel_writer/src/KernelArgument.cpp
@@ -23,6 +23,7 @@
*/
#include "ckw/KernelArgument.h"
+
#include "ckw/Error.h"
namespace ckw
diff --git a/compute_kernel_writer/src/KernelWriter.cpp b/compute_kernel_writer/src/KernelWriter.cpp
index 0bea1200b7..a478231c09 100644
--- a/compute_kernel_writer/src/KernelWriter.cpp
+++ b/compute_kernel_writer/src/KernelWriter.cpp
@@ -23,14 +23,16 @@
*/
#include "ckw/KernelWriter.h"
+
#include "ckw/Error.h"
#include "ckw/TileOperand.h"
#include "ckw/types/TargetArchitecture.h"
#include "ckw/types/TargetLanguage.h"
-#include "src/TileView.h"
+
#include "src/cl/CLKernelWriter.h"
#include "src/cl/CLTensorArgument.h"
#include "src/cl/CLTile.h"
+#include "src/TileView.h"
#include <tuple>
@@ -42,7 +44,7 @@ KernelWriter::~KernelWriter() = default;
std::unique_ptr<KernelWriter> KernelWriter::create_instance(TargetArchitecture architecture, TargetLanguage language)
{
CKW_UNUSED(architecture);
- switch(language)
+ switch (language)
{
case TargetLanguage::OpenCL:
// Currently this is the oldest and the only supported GPU architecture.
@@ -95,7 +97,7 @@ TileOperand KernelWriter::create_tile_operand(ITile &tile)
std::tuple<ITile &, TileArea> KernelWriter::get_tile(const TileOperand &operand)
{
- return { *operand._tile, { operand._row_start, operand._row_end, operand._col_start, operand._col_end } };
+ return {*operand._tile, {operand._row_start, operand._row_end, operand._col_start, operand._col_end}};
}
TensorOperand KernelWriter::create_tensor_operand(ITensor &tensor)
diff --git a/compute_kernel_writer/src/Tensor3dMapper.cpp b/compute_kernel_writer/src/Tensor3dMapper.cpp
index 7384b924da..acef6412a4 100644
--- a/compute_kernel_writer/src/Tensor3dMapper.cpp
+++ b/compute_kernel_writer/src/Tensor3dMapper.cpp
@@ -26,19 +26,19 @@
#include "ckw/Error.h"
#include "ckw/types/TensorSamplerTypes.h"
+
#include "src/ITensor.h"
#include "src/ITile.h"
namespace ckw
{
-Tensor3dMapper::Tensor3dMapper(ITensor *tensor, TensorSamplerFormat format)
- : _tensor(tensor), _format(format)
+Tensor3dMapper::Tensor3dMapper(ITensor *tensor, TensorSamplerFormat format) : _tensor(tensor), _format(format)
{
}
TileVariable Tensor3dMapper::dim_x() const
{
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -51,7 +51,7 @@ TileVariable Tensor3dMapper::dim_x() const
TileVariable Tensor3dMapper::dim_y() const
{
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
return _tensor->component(TensorComponentType::Dim1xDim2).scalar(0, 0);
@@ -67,10 +67,10 @@ TileVariable Tensor3dMapper::dim_z() const
{
TileVariable dim_one;
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
- dim_one = _tensor->component(TensorComponentType::Dim3).scalar(0, 0);
+ dim_one = _tensor->component(TensorComponentType::Dim3).scalar(0, 0);
dim_one.str = "1";
return dim_one;
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -85,7 +85,7 @@ TileVariable Tensor3dMapper::dim_batch() const
{
TileVariable dim_one;
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -98,7 +98,7 @@ TileVariable Tensor3dMapper::dim_batch() const
TileVariable Tensor3dMapper::stride_x() const
{
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -111,7 +111,7 @@ TileVariable Tensor3dMapper::stride_x() const
TileVariable Tensor3dMapper::stride_y() const
{
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -126,10 +126,10 @@ TileVariable Tensor3dMapper::stride_z() const
{
TileVariable stride_zero;
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
- stride_zero = _tensor->component(TensorComponentType::Stride3).scalar(0, 0);
+ stride_zero = _tensor->component(TensorComponentType::Stride3).scalar(0, 0);
stride_zero.str = "0";
return stride_zero;
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -142,7 +142,7 @@ TileVariable Tensor3dMapper::stride_z() const
TileVariable Tensor3dMapper::stride_batch() const
{
- switch(_format)
+ switch (_format)
{
case TensorSamplerFormat::Dim0_Dim1xDim2_1:
case TensorSamplerFormat::Dim0_Dim1_Dim2:
@@ -152,4 +152,4 @@ TileVariable Tensor3dMapper::stride_batch() const
return _tensor->component(TensorComponentType::Unknown).scalar(0, 0);
}
}
-} // namespace ckw
\ No newline at end of file
+} // namespace ckw
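Aside, not part of the commit: the switches reformatted above encode the two sampler formats. Dim0_Dim1_Dim2 keeps the three dimensions separate, while Dim0_Dim1xDim2_1 folds dims 1 and 2 into the single Dim1xDim2 component and pins the z dimension to size "1" with stride "0", as the literal overrides show. A rough standalone sketch of that collapse, with a hypothetical component name:

    #include <cassert>
    #include <string>

    enum class Format { Dim0_Dim1xDim2_1, Dim0_Dim1_Dim2 };

    // Illustration only: what the mapper reports for the z dimension.
    std::string mapped_dim_z(Format f, const std::string &dim2_component)
    {
        return f == Format::Dim0_Dim1xDim2_1 ? "1" : dim2_component;
    }

    int main()
    {
        assert(mapped_dim_z(Format::Dim0_Dim1xDim2_1, "tensor_dim2") == "1");
        assert(mapped_dim_z(Format::Dim0_Dim1_Dim2, "tensor_dim2") == "tensor_dim2");
        return 0;
    }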
diff --git a/compute_kernel_writer/src/Tensor3dMapper.h b/compute_kernel_writer/src/Tensor3dMapper.h
index fa68ac2d15..e94b595193 100644
--- a/compute_kernel_writer/src/Tensor3dMapper.h
+++ b/compute_kernel_writer/src/Tensor3dMapper.h
@@ -74,8 +74,8 @@ public:
TileVariable stride_batch() const;
private:
- ITensor *_tensor;
- TensorSamplerFormat _format;
+ ITensor *_tensor;
+ TensorSamplerFormat _format;
};
} // namespace ckw
diff --git a/compute_kernel_writer/src/TensorOperand.cpp b/compute_kernel_writer/src/TensorOperand.cpp
index 5ad24c6276..bf11d0d332 100644
--- a/compute_kernel_writer/src/TensorOperand.cpp
+++ b/compute_kernel_writer/src/TensorOperand.cpp
@@ -23,13 +23,13 @@
*/
#include "ckw/TensorOperand.h"
+
#include "src/ITensor.h"
namespace ckw
{
-TensorOperand::TensorOperand(ITensor &tensor)
- : _tensor(tensor)
+TensorOperand::TensorOperand(ITensor &tensor) : _tensor(tensor)
{
}
@@ -108,4 +108,4 @@ TileOperand TensorOperand::offset_first_element_in_bytes()
return TileOperand(_tensor.component(TensorComponentType::OffsetFirstElement));
}
-} // namespace ckw
\ No newline at end of file
+} // namespace ckw
diff --git a/compute_kernel_writer/src/TensorSampler.cpp b/compute_kernel_writer/src/TensorSampler.cpp
index 2ee8df4bca..91d5af2fd0 100644
--- a/compute_kernel_writer/src/TensorSampler.cpp
+++ b/compute_kernel_writer/src/TensorSampler.cpp
@@ -32,7 +32,11 @@ TensorSampler::TensorSampler(TensorStorageType storage,
TensorSamplerAddressModeX address_mode_x,
TensorSamplerAddressModeY address_mode_y,
TensorSamplerAddressModeZ address_mode_z)
- : _storage(storage), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z)
+ : _storage(storage),
+ _format(format),
+ _address_mode_x(address_mode_x),
+ _address_mode_y(address_mode_y),
+ _address_mode_z(address_mode_z)
{
}
diff --git a/compute_kernel_writer/src/TensorUtils.cpp b/compute_kernel_writer/src/TensorUtils.cpp
index 24836092d4..17fc9547ae 100644
--- a/compute_kernel_writer/src/TensorUtils.cpp
+++ b/compute_kernel_writer/src/TensorUtils.cpp
@@ -23,6 +23,7 @@
*/
#include "src/TensorUtils.h"
+
#include "ckw/Error.h"
#include "ckw/TensorInfo.h"
#include "ckw/types/TensorComponentType.h"
@@ -31,10 +32,10 @@ namespace ckw
{
TensorComponentType get_tensor_dimension(TensorDataLayout layout, TensorDataLayoutComponent component)
{
- switch(layout)
+ switch (layout)
{
case TensorDataLayout::Nhwc:
- switch(component)
+ switch (component)
{
case TensorDataLayoutComponent::C:
return TensorComponentType::Dim0;
@@ -49,7 +50,7 @@ TensorComponentType get_tensor_dimension(TensorDataLayout layout, TensorDataLayo
return TensorComponentType::Unknown;
}
case TensorDataLayout::Ndhwc:
- switch(component)
+ switch (component)
{
case TensorDataLayoutComponent::C:
return TensorComponentType::Dim0;
@@ -73,10 +74,10 @@ TensorComponentType get_tensor_dimension(TensorDataLayout layout, TensorDataLayo
TensorComponentType get_tensor_stride(TensorDataLayout layout, TensorDataLayoutComponent component)
{
- switch(layout)
+ switch (layout)
{
case TensorDataLayout::Nhwc:
- switch(component)
+ switch (component)
{
case TensorDataLayoutComponent::C:
return TensorComponentType::Stride0;
@@ -91,7 +92,7 @@ TensorComponentType get_tensor_stride(TensorDataLayout layout, TensorDataLayoutC
return TensorComponentType::Unknown;
}
case TensorDataLayout::Ndhwc:
- switch(component)
+ switch (component)
{
case TensorDataLayoutComponent::C:
return TensorComponentType::Stride0;
diff --git a/compute_kernel_writer/src/TileInfo.cpp b/compute_kernel_writer/src/TileInfo.cpp
index 66d8cb1620..273266eedc 100644
--- a/compute_kernel_writer/src/TileInfo.cpp
+++ b/compute_kernel_writer/src/TileInfo.cpp
@@ -26,18 +26,15 @@
namespace ckw
{
-TileInfo::TileInfo(DataType dt)
- : _dt(dt), _shape({ { 1, 1 } })
+TileInfo::TileInfo(DataType dt) : _dt(dt), _shape({{1, 1}})
{
}
-TileInfo::TileInfo(DataType dt, int32_t w)
- : _dt(dt), _shape({ { w, 1 } })
+TileInfo::TileInfo(DataType dt, int32_t w) : _dt(dt), _shape({{w, 1}})
{
}
-TileInfo::TileInfo(DataType dt, int32_t h, int32_t w)
- : _dt(dt), _shape({ { w, h } })
+TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) : _dt(dt), _shape({{w, h}})
{
}
diff --git a/compute_kernel_writer/src/TileOperand.cpp b/compute_kernel_writer/src/TileOperand.cpp
index 3dfa2b8b2b..865ef85a13 100644
--- a/compute_kernel_writer/src/TileOperand.cpp
+++ b/compute_kernel_writer/src/TileOperand.cpp
@@ -23,7 +23,9 @@
*/
#include "ckw/TileOperand.h"
+
#include "ckw/Error.h"
+
#include "src/ITile.h"
namespace ckw
@@ -34,7 +36,8 @@ TileOperand::TileOperand(ITile &tile)
{
}
-TileOperand::TileOperand(const TileOperand &operand, int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end)
+TileOperand::TileOperand(
+ const TileOperand &operand, int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end)
: _tile(operand._tile), _row_start(row_start), _row_end(row_end), _col_start(col_start), _col_end(col_end)
{
CKW_ASSERT(row_start >= 0 && row_start < _tile->info().height());
@@ -50,7 +53,8 @@ TileOperand TileOperand::tile(int32_t row_start, int32_t row_end, int32_t col_st
CKW_ASSERT(col_start >= 0 && _col_start + col_start < _col_end);
CKW_ASSERT(col_end > col_start && _col_start + col_end <= _col_end);
- return TileOperand(*this, _row_start + row_start, _row_start + row_end, _col_start + col_start, _col_start + col_end);
+ return TileOperand(*this, _row_start + row_start, _row_start + row_end, _col_start + col_start,
+ _col_start + col_end);
}
TileOperand TileOperand::row(int32_t row) const
diff --git a/compute_kernel_writer/src/TileView.h b/compute_kernel_writer/src/TileView.h
index e0d034fa8d..50ae66b389 100644
--- a/compute_kernel_writer/src/TileView.h
+++ b/compute_kernel_writer/src/TileView.h
@@ -27,6 +27,7 @@
#include "ckw/Error.h"
#include "ckw/types/DataType.h"
+
#include "src/ITile.h"
#include <cstdint>
@@ -81,8 +82,7 @@ public:
*
* @param[in] tile The tile object.
*/
- TileView(const T &tile)
- : _tile(&tile), _area(0, tile.info().height(), 0, tile.info().width())
+ TileView(const T &tile) : _tile(&tile), _area(0, tile.info().height(), 0, tile.info().width())
{
}
@@ -91,8 +91,7 @@ public:
* @param[in] tile The tile object.
* @param[in] area The rectangular active area.
*/
- TileView(const T &tile, const TileArea &area)
- : _tile(&tile), _area(area)
+ TileView(const T &tile, const TileArea &area) : _tile(&tile), _area(area)
{
}
@@ -176,7 +175,8 @@ public:
/** Get whether the tile view refers to the whole tile. */
bool is_full_tile() const
{
- return row_start() == 0 && row_end() == _tile->info().height() && col_start() == 0 && col_end() == _tile->info().width();
+ return row_start() == 0 && row_end() == _tile->info().height() && col_start() == 0 &&
+ col_end() == _tile->info().width();
}
private:
diff --git a/compute_kernel_writer/src/cl/CLHelpers.cpp b/compute_kernel_writer/src/cl/CLHelpers.cpp
index ff4408b1a3..8e4a932764 100644
--- a/compute_kernel_writer/src/cl/CLHelpers.cpp
+++ b/compute_kernel_writer/src/cl/CLHelpers.cpp
@@ -28,6 +28,7 @@
#include "ckw/types/DataType.h"
#include "ckw/types/Operators.h"
#include "ckw/types/TensorStorageType.h"
+
#include "src/types/DataTypeHelpers.h"
namespace ckw
@@ -35,7 +36,7 @@ namespace ckw
bool cl_validate_vector_length(int32_t len)
{
bool valid_vector_length = true;
- if(len < 1 || len > 16 || (len > 4 && len < 8) || (len > 8 && len < 16))
+ if (len < 1 || len > 16 || (len > 4 && len < 8) || (len > 8 && len < 16))
{
valid_vector_length = false;
}
@@ -44,14 +45,14 @@ bool cl_validate_vector_length(int32_t len)
std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len)
{
- if(cl_validate_vector_length(len) == false)
+ if (cl_validate_vector_length(len) == false)
{
CKW_THROW_MSG("Unsupported vector length");
return "";
}
std::string res;
- switch(dt)
+ switch (dt)
{
case DataType::Fp32:
res += "float";
@@ -85,7 +86,7 @@ std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len)
return "";
}
- if(len > 1)
+ if (len > 1)
{
res += std::to_string(len);
}
@@ -95,7 +96,7 @@ std::string cl_get_variable_datatype_as_string(DataType dt, int32_t len)
int32_t cl_round_up_to_nearest_valid_vector_width(int32_t width)
{
- switch(width)
+ switch (width)
{
case 1:
return 1;
@@ -128,7 +129,7 @@ int32_t cl_round_up_to_nearest_valid_vector_width(int32_t width)
std::string cl_get_variable_storagetype_as_string(TensorStorageType storage)
{
std::string res;
- switch(storage)
+ switch (storage)
{
case TensorStorageType::BufferUint8Ptr:
res += "__global uchar*";
@@ -148,7 +149,7 @@ std::string cl_get_variable_storagetype_as_string(TensorStorageType storage)
std::string cl_get_assignment_op_as_string(AssignmentOp op)
{
- switch(op)
+ switch (op)
{
case AssignmentOp::Increment:
return "+=";
@@ -163,34 +164,34 @@ std::string cl_get_assignment_op_as_string(AssignmentOp op)
std::tuple<bool, std::string> cl_get_unary_op(UnaryOp op)
{
- switch(op)
+ switch (op)
{
case UnaryOp::LogicalNot:
- return { false, "!" };
+ return {false, "!"};
case UnaryOp::BitwiseNot:
- return { false, "~" };
+ return {false, "~"};
case UnaryOp::Exp:
- return { true, "exp" };
+ return {true, "exp"};
case UnaryOp::Tanh:
- return { true, "tanh" };
+ return {true, "tanh"};
case UnaryOp::Sqrt:
- return { true, "sqrt" };
+ return {true, "sqrt"};
case UnaryOp::Erf:
- return { true, "erf" };
+ return {true, "erf"};
case UnaryOp::Fabs:
- return { true, "fabs" };
+ return {true, "fabs"};
case UnaryOp::Log:
- return { true, "log" };
+ return {true, "log"};
case UnaryOp::Round:
- return { true, "round" };
+ return {true, "round"};
default:
CKW_THROW_MSG("Unsupported unary operation!");
@@ -201,52 +202,52 @@ std::tuple<bool, std::string> cl_get_binary_op(BinaryOp op, DataType data_type)
{
const auto is_float = is_data_type_float(data_type);
- switch(op)
+ switch (op)
{
case BinaryOp::Add:
- return { false, "+" };
+ return {false, "+"};
case BinaryOp::Sub:
- return { false, "-" };
+ return {false, "-"};
case BinaryOp::Mul:
- return { false, "*" };
+ return {false, "*"};
case BinaryOp::Div:
- return { false, "/" };
+ return {false, "/"};
case BinaryOp::Mod:
- return { false, "%" };
+ return {false, "%"};
case BinaryOp::Equal:
- return { false, "==" };
+ return {false, "=="};
case BinaryOp::Less:
- return { false, "<" };
+ return {false, "<"};
case BinaryOp::LessEqual:
- return { false, "<=" };
+ return {false, "<="};
case BinaryOp::Greater:
- return { false, ">" };
+ return {false, ">"};
case BinaryOp::GreaterEqual:
- return { false, ">=" };
+ return {false, ">="};
case BinaryOp::LogicalAnd:
- return { false, "&&" };
+ return {false, "&&"};
case BinaryOp::LogicalOr:
- return { false, "||" };
+ return {false, "||"};
case BinaryOp::BitwiseXOR:
- return { false, "^" };
+ return {false, "^"};
case BinaryOp::Min:
- return { true, is_float ? "fmin" : "min" };
+ return {true, is_float ? "fmin" : "min"};
case BinaryOp::Max:
- return { true, is_float ? "fmax" : "max" };
+ return {true, is_float ? "fmax" : "max"};
default:
CKW_THROW_MSG("Unsupported binary operator/function!");
@@ -255,13 +256,13 @@ std::tuple<bool, std::string> cl_get_binary_op(BinaryOp op, DataType data_type)
std::tuple<bool, std::string> cl_get_ternary_op(TernaryOp op)
{
- switch(op)
+ switch (op)
{
case TernaryOp::Select:
- return { true, "select" };
+ return {true, "select"};
case TernaryOp::Clamp:
- return { true, "clamp" };
+ return {true, "clamp"};
default:
CKW_THROW_MSG("Unsupported ternary function!");
@@ -273,7 +274,7 @@ std::string cl_data_type_rounded_up_to_valid_vector_width(DataType dt, int32_t w
std::string data_type;
const int32_t w = cl_round_up_to_nearest_valid_vector_width(width);
data_type += cl_get_variable_datatype_as_string(dt, 1);
- if(w != 1)
+ if (w != 1)
{
data_type += std::to_string(w);
}
@@ -284,7 +285,7 @@ std::vector<int32_t> cl_decompose_vector_width(int32_t vector_width)
{
std::vector<int32_t> x;
- switch(vector_width)
+ switch (vector_width)
{
case 0:
break;
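Aside, not part of the commit: cl_validate_vector_length() above accepts exactly the OpenCL C vector widths 1 to 4, 8 and 16. A standalone restatement of that predicate, runnable on its own:

    #include <cassert>
    #include <cstdint>

    // Same check as cl_validate_vector_length(), reproduced for illustration.
    static bool is_valid_cl_vector_length(int32_t len)
    {
        return !(len < 1 || len > 16 || (len > 4 && len < 8) || (len > 8 && len < 16));
    }

    int main()
    {
        assert(is_valid_cl_vector_length(3));   // float3 etc. are valid
        assert(!is_valid_cl_vector_length(5));  // no 5-wide vectors in OpenCL C
        assert(!is_valid_cl_vector_length(12)); // nor 12-wide
        assert(is_valid_cl_vector_length(16));
        return 0;
    }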
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.cpp b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
index 2db9c139b7..62e6853a7a 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.cpp
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
@@ -31,14 +31,15 @@
#include "ckw/types/DataType.h"
#include "ckw/types/MemoryOperation.h"
#include "ckw/types/TargetLanguage.h"
-#include "src/ITensorComponent.h"
-#include "src/TileView.h"
+
#include "src/cl/CLHelpers.h"
#include "src/cl/CLTensorArgument.h"
#include "src/cl/CLTile.h"
#include "src/cl/helpers/CLMemoryOpBufferHelper.h"
#include "src/cl/helpers/CLMemoryOpImage2dHelper.h"
#include "src/cl/helpers/ICLMemoryOpHelper.h"
+#include "src/ITensorComponent.h"
+#include "src/TileView.h"
#include "src/types/DataTypeHelpers.h"
#include <algorithm>
@@ -63,14 +64,14 @@ std::unique_ptr<Kernel> CLKernelWriter::emit_kernel(const std::string &name)
// Create the list of arguments.
std::vector<KernelArgument> arguments;
- for(const auto &tensor : _tensors)
+ for (const auto &tensor : _tensors)
{
const auto tensor_id = tensor->info().id();
const auto storages = tensor->storages();
const auto components = tensor->components();
- for(const auto &storage : storages)
+ for (const auto &storage : storages)
{
code += cl_get_variable_storagetype_as_string(storage.type);
code += " ";
@@ -80,7 +81,7 @@ std::unique_ptr<Kernel> CLKernelWriter::emit_kernel(const std::string &name)
arguments.emplace_back(tensor_id, storage.type);
}
- for(const auto &component : components)
+ for (const auto &component : components)
{
const auto &tile = component->tile();
const auto &tile_info = tile.info();
@@ -96,7 +97,7 @@ std::unique_ptr<Kernel> CLKernelWriter::emit_kernel(const std::string &name)
}
}
- if(code.size() >= 2 && code[code.size() - 2] == ',' && code[code.size() - 1] == '\n')
+ if (code.size() >= 2 && code[code.size() - 2] == ',' && code[code.size() - 1] == '\n')
{
// Remove the last comma in the argument list.
code.pop_back();
@@ -127,11 +128,12 @@ void CLKernelWriter::op_assign(const TileOperand &dst, const TileOperand &src)
const std::string src_prefix = broadcast_src_x ? "(" + data_type_str + ")" : "";
CKW_ASSERT_MSG(src_view.data_type() == dst_view.data_type(), "Source and destination type must match.");
- CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1, "Tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1,
+ "Tile height must match or source is broadcasting in y dimension.");
CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension.");
// Broadcasting on y dimension is automatic (see CLTile::vector).
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
append_code(dst_view.vector(y).str, " = ", src_prefix, src_view.vector(y).str, ";\n");
}
@@ -158,13 +160,15 @@ void CLKernelWriter::op_cast(const TileOperand &dst, const TileOperand &src, Con
const std::string prefix = broadcast_x ? "(" + dst_type_str + ")" : "";
CKW_ASSERT_MSG(src_view.data_type() != dst_view.data_type(), "Source and destination type must be different.");
- CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1, "Tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1,
+ "Tile height must match or source is broadcasting in y dimension.");
CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension.");
// Broadcasting on y dimension is automatic (see CLTile::vector).
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
- append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(", src_view.vector(y).str, ");\n");
+ append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
+ src_view.vector(y).str, ");\n");
}
}
@@ -189,11 +193,12 @@ void CLKernelWriter::op_unary(const TileOperand &dst, UnaryOp op, const TileOper
const auto op_suffix = op_is_func ? ")" : "";
CKW_ASSERT_MSG(src_view.data_type() == dst_view.data_type(), "Source and destination type must match.");
- CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1, "Tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1,
+ "Tile height must match or source is broadcasting in y dimension.");
CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension.");
// Broadcasting on y dimension is automatic (see CLTile::vector).
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
append_code(dst_view.vector(y).str, " = ", src_prefix, op_prefix, src_view.vector(y).str, op_suffix, ";\n");
}
@@ -214,27 +219,28 @@ void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOp
CKW_ASSERT_MSG(lhs_view.data_type() == rhs_view.data_type(), "LHS and RHS type must match.");
- CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1, "LHS tile height must match or source is broadcasting in y dimension.");
- CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1, "RHS tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
+ "LHS tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
+ "RHS tile height must match or source is broadcasting in y dimension.");
- CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1, "LHS tile width must match destination or LHS is broadcasting in x dimension.");
- CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1, "RHS tile width must match destination or RHS is broadcasting in x dimension.");
+ CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
+ "LHS tile width must match destination or LHS is broadcasting in x dimension.");
+ CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
+ "RHS tile width must match destination or RHS is broadcasting in x dimension.");
- if(op == BinaryOp::MatMul_Nt_T)
+ if (op == BinaryOp::MatMul_Nt_T)
{
CKW_ASSERT(is_data_type_float(data_type));
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
- for(int32_t x = 0; x < dst_w; ++x)
+ for (int32_t x = 0; x < dst_w; ++x)
{
- for(int32_t k = 0; k < lhs_w; ++k)
+ for (int32_t k = 0; k < lhs_w; ++k)
{
- append_code(
- dst_view.scalar(x, y).str, " = fma(",
- lhs_view.scalar(k, y).str, ", ",
- rhs_view.scalar(k, x).str, ", ",
- dst_view.scalar(x, y).str, ");\n");
+ append_code(dst_view.scalar(x, y).str, " = fma(", lhs_view.scalar(k, y).str, ", ",
+ rhs_view.scalar(k, x).str, ", ", dst_view.scalar(x, y).str, ");\n");
}
}
}
@@ -258,14 +264,16 @@ void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOp
const std::string op_suffix = op_is_func ? ");\n" : ";\n";
// Broadcasting on y dimension is automatic (see CLTile::vector).
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
- append_code(dst_view.vector(y).str, op_prefix, lhs_prefix, lhs_view.vector(y).str, op_separator, rhs_prefix, rhs_view.vector(y).str, op_suffix);
+ append_code(dst_view.vector(y).str, op_prefix, lhs_prefix, lhs_view.vector(y).str, op_separator, rhs_prefix,
+ rhs_view.vector(y).str, op_suffix);
}
}
}
-void CLKernelWriter::op_ternary(const TileOperand &dst, TernaryOp op, const TileOperand &first, const TileOperand &second, const TileOperand &third)
+void CLKernelWriter::op_ternary(
+ const TileOperand &dst, TernaryOp op, const TileOperand &first, const TileOperand &second, const TileOperand &third)
{
const auto dst_view = to_cl_tile_view(dst);
const auto first_view = to_cl_tile_view(first);
@@ -297,37 +305,42 @@ void CLKernelWriter::op_ternary(const TileOperand &dst, TernaryOp op, const Tile
CKW_ASSERT_MSG(second_view.data_type() == dst_view.data_type(), "2nd source and destination type must match.");
CKW_ASSERT_MSG(third_view.data_type() == dst_view.data_type(), "3rd source and destination type must match.");
- CKW_ASSERT_MSG(first_view.height() == dst_h || first_view.height() == 1, "1st tile height must match or source is broadcasting in y dimension.");
- CKW_ASSERT_MSG(second_view.height() == dst_h || second_view.height() == 1, "2nd tile height must match or source is broadcasting in y dimension.");
- CKW_ASSERT_MSG(third_view.height() == dst_h || third_view.height() == 1, "3rd tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(first_view.height() == dst_h || first_view.height() == 1,
+ "1st tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(second_view.height() == dst_h || second_view.height() == 1,
+ "2nd tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(third_view.height() == dst_h || third_view.height() == 1,
+ "3rd tile height must match or source is broadcasting in y dimension.");
- CKW_ASSERT_MSG(first_w == dst_w || first_w == 1, "1st tile width must match or source is broadcasting in x dimension.");
- CKW_ASSERT_MSG(second_w == dst_w || second_w == 1, "2nd tile width must match or source is broadcasting in x dimension.");
- CKW_ASSERT_MSG(third_w == dst_w || third_w == 1, "3rd tile width must match or source is broadcasting in x dimension.");
+ CKW_ASSERT_MSG(first_w == dst_w || first_w == 1,
+ "1st tile width must match or source is broadcasting in x dimension.");
+ CKW_ASSERT_MSG(second_w == dst_w || second_w == 1,
+ "2nd tile width must match or source is broadcasting in x dimension.");
+ CKW_ASSERT_MSG(third_w == dst_w || third_w == 1,
+ "3rd tile width must match or source is broadcasting in x dimension.");
// Broadcasting on y dimension is automatic (see CLTile::vector).
- for(int32_t y = 0; y < dst_h; ++y)
+ for (int32_t y = 0; y < dst_h; ++y)
{
- append_code(
- dst_view.vector(y).str, " = ", op_name, "(",
- first_prefix, first_view.vector(y).str, ", ",
- second_prefix, second_view.vector(y).str, ", ",
- third_prefix, third_view.vector(y).str, ");\n");
+ append_code(dst_view.vector(y).str, " = ", op_name, "(", first_prefix, first_view.vector(y).str, ", ",
+ second_prefix, second_view.vector(y).str, ", ", third_prefix, third_view.vector(y).str, ");\n");
}
}
-void CLKernelWriter::op_if_generic(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body, bool is_else_if)
+void CLKernelWriter::op_if_generic(
+ const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body, bool is_else_if)
{
const auto lhs_view = to_cl_tile_view(lhs);
const auto rhs_view = to_cl_tile_view(rhs);
const auto op_name = std::get<1>(cl_get_binary_op(op, lhs_view.data_type()));
- CKW_ASSERT(op == BinaryOp::Less || op == BinaryOp::LessEqual || op == BinaryOp::Equal || op == BinaryOp::GreaterEqual || op == BinaryOp::Greater);
+ CKW_ASSERT(op == BinaryOp::Less || op == BinaryOp::LessEqual || op == BinaryOp::Equal ||
+ op == BinaryOp::GreaterEqual || op == BinaryOp::Greater);
CKW_ASSERT(lhs_view.is_scalar());
CKW_ASSERT(rhs_view.is_scalar());
- if(is_else_if)
+ if (is_else_if)
{
append_code("else ");
}
@@ -337,12 +350,18 @@ void CLKernelWriter::op_if_generic(const TileOperand &lhs, BinaryOp op, const Ti
append_code("}\n");
}
-void CLKernelWriter::op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body)
+void CLKernelWriter::op_if(const TileOperand &lhs,
+ BinaryOp op,
+ const TileOperand &rhs,
+ const std::function<void()> &body)
{
op_if_generic(lhs, op, rhs, body, false /* is_else_if */);
}
-void CLKernelWriter::op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body)
+void CLKernelWriter::op_else_if(const TileOperand &lhs,
+ BinaryOp op,
+ const TileOperand &rhs,
+ const std::function<void()> &body)
{
op_if_generic(lhs, op, rhs, body, true /* is_else_if */);
}
@@ -354,10 +373,13 @@ void CLKernelWriter::op_else(const std::function<void()> &body)
append_code("}\n");
}
-void CLKernelWriter::op_for_loop(
- const TileOperand &var, BinaryOp cond_op, const TileOperand &cond_value,
- const TileOperand &update_var, AssignmentOp update_op, const TileOperand &update_value,
- const std::function<void()> &body)
+void CLKernelWriter::op_for_loop(const TileOperand &var,
+ BinaryOp cond_op,
+ const TileOperand &cond_value,
+ const TileOperand &update_var,
+ AssignmentOp update_op,
+ const TileOperand &update_value,
+ const std::function<void()> &body)
{
const auto var_view = to_cl_tile_view(var);
const auto cond_value_view = to_cl_tile_view(cond_value);
@@ -373,11 +395,12 @@ void CLKernelWriter::op_for_loop(
CKW_ASSERT(update_var_view.data_type() == update_value_view.data_type());
const auto cond_op_name = std::get<1>(cl_get_binary_op(cond_op, var_view.data_type()));
- CKW_ASSERT(cond_op == BinaryOp::Less || cond_op == BinaryOp::LessEqual || cond_op == BinaryOp::Equal || cond_op == BinaryOp::GreaterEqual || cond_op == BinaryOp::Greater);
+ CKW_ASSERT(cond_op == BinaryOp::Less || cond_op == BinaryOp::LessEqual || cond_op == BinaryOp::Equal ||
+ cond_op == BinaryOp::GreaterEqual || cond_op == BinaryOp::Greater);
- append_code(
- "for (; ", var_view.scalar(0, 0).str, " ", cond_op_name, " ", cond_value_view.scalar(0, 0).str, "; ",
- update_var_view.scalar(0, 0).str, " ", cl_get_assignment_op_as_string(update_op), " ", update_value_view.scalar(0, 0).str, ")\n{\n");
+ append_code("for (; ", var_view.scalar(0, 0).str, " ", cond_op_name, " ", cond_value_view.scalar(0, 0).str, "; ",
+ update_var_view.scalar(0, 0).str, " ", cl_get_assignment_op_as_string(update_op), " ",
+ update_value_view.scalar(0, 0).str, ")\n{\n");
write_body(body);
append_code("}\n");
}
@@ -404,7 +427,7 @@ void CLKernelWriter::op_print(const std::string &prefix, const std::vector<TileO
std::string format_code;
std::string args_code;
- for(auto &op : operands)
+ for (auto &op : operands)
{
const auto tile_view = to_cl_tile_view(op);
@@ -416,12 +439,12 @@ void CLKernelWriter::op_print(const std::string &prefix, const std::vector<TileO
// Construct the format specifier to print out one row of the tile.
std::string row_format("%");
- if(width > 1)
+ if (width > 1)
{
row_format += "v" + std::to_string(width);
}
- switch(data_type)
+ switch (data_type)
{
case DataType::Fp32:
row_format += "hlg";
@@ -452,7 +475,7 @@ void CLKernelWriter::op_print(const std::string &prefix, const std::vector<TileO
CKW_THROW_MSG("Unsupported data type!");
}
- if(width > 1)
+ if (width > 1)
{
row_format = "[" + row_format + "]";
}
@@ -460,14 +483,14 @@ void CLKernelWriter::op_print(const std::string &prefix, const std::vector<TileO
// Construct the format specifier for the printf statement.
format_code += name + " = ";
- if(height == 1)
+ if (height == 1)
{
format_code += row_format;
}
else
{
format_code += "[" + row_format;
- for(int32_t row = 1; row < height; ++row)
+ for (int32_t row = 1; row < height; ++row)
{
format_code += ", " + row_format;
}
@@ -477,7 +500,7 @@ void CLKernelWriter::op_print(const std::string &prefix, const std::vector<TileO
format_code += "\\n";
// Construct the variable arguments for the printf statement.
- for(int32_t row = 0; row < height; ++row)
+ for (int32_t row = 0; row < height; ++row)
{
args_code += ", " + tile_view.vector(row).str;
}
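The printf format assembled by op_print above follows the pattern below. This is a standalone sketch for a hypothetical 2x4 Fp32 tile named "dst"; the closing bracket appended after the row loop is inferred from context rather than shown in this hunk.

#include <cstdint>
#include <iostream>
#include <string>

int main()
{
    const std::string name   = "dst";
    const int32_t     height = 2;
    const int32_t     width  = 4;

    std::string row_format("%");
    if (width > 1)
    {
        row_format += "v" + std::to_string(width); // vector printf specifier, e.g. %v4
    }
    row_format += "hlg"; // DataType::Fp32 branch of the switch above

    if (width > 1)
    {
        row_format = "[" + row_format + "]";
    }

    std::string format_code = name + " = ";
    if (height == 1)
    {
        format_code += row_format;
    }
    else
    {
        format_code += "[" + row_format;
        for (int32_t row = 1; row < height; ++row)
        {
            format_code += ", " + row_format;
        }
        format_code += "]"; // closing bracket inferred from the surrounding code
    }
    format_code += "\\n";

    std::cout << format_code << std::endl; // dst = [[%v4hlg], [%v4hlg]]\n
    return 0;
}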
@@ -527,19 +550,14 @@ TileOperand CLKernelWriter::declare_tile(const std::string &name, const TileInfo
const int32_t width = tile_info.width();
const DataType data_type = tile_info.data_type();
- CKW_ASSERT_MSG(
- std::find_if(
- _tiles.begin(), _tiles.end(),
- [=](const std::unique_ptr<CLTile> &e)
- {
- return e->name() == fullname;
- })
- == _tiles.end(),
- "There is already a tile with name: " + fullname);
+ CKW_ASSERT_MSG(std::find_if(_tiles.begin(), _tiles.end(),
+ [=](const std::unique_ptr<CLTile> &e)
+ { return e->name() == fullname; }) == _tiles.end(),
+ "There is already a tile with name: " + fullname);
auto tile = std::make_unique<CLTile>(fullname, tile_info);
- for(int32_t row = 0; row < height; ++row)
+ for (int32_t row = 0; row < height; ++row)
{
const std::string cl_type = cl_get_variable_datatype_as_string(data_type, width);
append_code(cl_type, " ", tile->vector(row).str, ";\n");
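declare_tile above emits one vector declaration per tile row. The standalone sketch below shows that output for a hypothetical 2x4 Fp32 tile; the name "dst" stands in for the prefixed fullname, and the "__<row>" suffix mirrors CLTile::create_var_name further down in this patch.

#include <cstdint>
#include <iostream>
#include <string>

int main()
{
    const std::string fullname = "dst";    // illustrative stand-in for the prefixed tile name
    const int32_t     height   = 2;
    const std::string cl_type  = "float4"; // cl_get_variable_datatype_as_string(Fp32, 4)

    for (int32_t row = 0; row < height; ++row)
    {
        // Multi-row tiles get a "__<row>" suffix per row (see CLTile::create_var_name below).
        std::cout << cl_type << " " << fullname << "__" << row << ";\n";
    }
    // Prints:
    //   float4 dst__0;
    //   float4 dst__1;
    return 0;
}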
@@ -578,40 +596,40 @@ TileView<CLTile> CLKernelWriter::to_cl_tile_view(const TileOperand &operand) con
{
bool found = false;
- for(const auto &t : _tiles)
+ for (const auto &t : _tiles)
{
- if(&tile == t.get())
+ if (&tile == t.get())
{
found = true;
break;
}
}
- for(const auto &t : _constant_tiles)
+ for (const auto &t : _constant_tiles)
{
- if(&tile == t.get())
+ if (&tile == t.get())
{
found = true;
break;
}
}
- if(!found)
+ if (!found)
{
- for(const auto &t : _tensors)
+ for (const auto &t : _tensors)
{
const auto components = t->components();
- for(const auto component : components)
+ for (const auto component : components)
{
- if(&tile == &component->tile())
+ if (&tile == &component->tile())
{
found = true;
break;
}
}
- if(found)
+ if (found)
{
break;
}
@@ -622,66 +640,106 @@ TileView<CLTile> CLKernelWriter::to_cl_tile_view(const TileOperand &operand) con
}
#endif // COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED
- return { static_cast<CLTile &>(tile), area };
+ return {static_cast<CLTile &>(tile), area};
}
-void CLKernelWriter::op_load(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch)
+void CLKernelWriter::op_load(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch)
{
- const CLTile dilation_x({ { "1" } }, DataType::Int32);
- const CLTile dilation_y({ { "1" } }, DataType::Int32);
+ const CLTile dilation_x({{"1"}}, DataType::Int32);
+ const CLTile dilation_y({{"1"}}, DataType::Int32);
- op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, false /* indirect buffer */);
+ op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y,
+ false /* indirect buffer */);
}
-void CLKernelWriter::op_load_dilated(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileOperand &dilation_x, const TileOperand &dilation_y)
+void CLKernelWriter::op_load_dilated(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileOperand &dilation_x,
+ const TileOperand &dilation_y)
{
const auto dil_x_view = to_cl_tile_view(dilation_x);
const auto dil_y_view = to_cl_tile_view(dilation_y);
- op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_view, dil_y_view, false /* indirect buffer */);
+ op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_view, dil_y_view,
+ false /* indirect buffer */);
}
-void CLKernelWriter::op_store(const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch)
+void CLKernelWriter::op_store(const TensorOperand &tensor_op,
+ const TileOperand &tile_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch)
{
- const CLTile dilation_x({ { "1" } }, DataType::Int32);
- const CLTile dilation_y({ { "1" } }, DataType::Int32);
+ const CLTile dilation_x({{"1"}}, DataType::Int32);
+ const CLTile dilation_y({{"1"}}, DataType::Int32);
- op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, false /* indirect buffer */);
+ op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y,
+ false /* indirect buffer */);
}
-void CLKernelWriter::op_store_dilated(const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileOperand &dilation_x, const TileOperand &dilation_y)
+void CLKernelWriter::op_store_dilated(const TensorOperand &tensor_op,
+ const TileOperand &tile_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileOperand &dilation_x,
+ const TileOperand &dilation_y)
{
const auto dil_x_view = to_cl_tile_view(dilation_x);
const auto dil_y_view = to_cl_tile_view(dilation_y);
- op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_view, dil_y_view, false /* indirect buffer */);
+ op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_view, dil_y_view,
+ false /* indirect buffer */);
}
-void CLKernelWriter::op_load_indirect(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch)
+void CLKernelWriter::op_load_indirect(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch)
{
- const CLTile dilation_x({ { "1" } }, DataType::Int32);
- const CLTile dilation_y({ { "1" } }, DataType::Int32);
+ const CLTile dilation_x({{"1"}}, DataType::Int32);
+ const CLTile dilation_y({{"1"}}, DataType::Int32);
- op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, true /* indirect buffer */);
+ op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y,
+ true /* indirect buffer */);
}
-void CLKernelWriter::op_load_store(MemoryOperation op, const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileView<CLTile> &dilation_x, const TileView<CLTile> &dilation_y, bool indirect_buffer)
+void CLKernelWriter::op_load_store(MemoryOperation op,
+ const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileView<CLTile> &dilation_x,
+ const TileView<CLTile> &dilation_y,
+ bool indirect_buffer)
{
CKW_UNUSED(dilation_x);
CKW_ASSERT(dilation_x.is_scalar());
CKW_ASSERT(dilation_y.is_scalar());
CKW_ASSERT(dilation_x.scalar(0, 0).str == "((int)(1))"); // Dilation in x dimension is not implemented yet
- if(indirect_buffer)
+ if (indirect_buffer)
{
CKW_ASSERT(dilation_y.scalar(0, 0).str == "((int)(1))" && dilation_x.scalar(0, 0).str == "((int)(1))");
}
@@ -689,7 +747,7 @@ void CLKernelWriter::op_load_store(MemoryOperation op, const TileOperand &tile_o
ITensor &tensor = get_tensor(tensor_op);
std::unique_ptr<ICLMemoryOpHelper> helper;
- switch(sampler.storage())
+ switch (sampler.storage())
{
case TensorStorageType::BufferUint8Ptr:
helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op);
@@ -717,13 +775,13 @@ void CLKernelWriter::op_load_store(MemoryOperation op, const TileOperand &tile_o
helper->initialize(&tile, &x_tile, &z_tile, &batch_tile);
- for(int row = 0; row < tile.info().height(); ++row)
+ for (int row = 0; row < tile.info().height(); ++row)
{
- if(!indirect_buffer)
+ if (!indirect_buffer)
{
std::string coord_y = y_tile.scalar(0, 0).str + " + " + std::to_string(row);
- if(dilation_y.scalar(0, 0).str != "((int)(1))")
+ if (dilation_y.scalar(0, 0).str != "((int)(1))")
{
coord_y += " * " + dilation_y.scalar(0, 0).str;
}
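The row loop above derives one y coordinate string per tile row and appends the dilation factor only when it is not the unit constant. A standalone sketch with illustrative coordinate and dilation strings:

#include <iostream>
#include <string>

int main()
{
    const std::string y_scalar   = "y0";    // y_tile.scalar(0, 0).str
    const std::string dilation_y = "dil_y"; // dilation_y.scalar(0, 0).str
    const int         height     = 3;       // tile.info().height()

    for (int row = 0; row < height; ++row)
    {
        std::string coord_y = y_scalar + " + " + std::to_string(row);
        if (dilation_y != "((int)(1))") // unit dilation is folded away, as in the check above
        {
            coord_y += " * " + dilation_y;
        }
        std::cout << coord_y << "\n"; // y0 + 0 * dil_y, y0 + 1 * dil_y, y0 + 2 * dil_y
    }
    return 0;
}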
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.h b/compute_kernel_writer/src/cl/CLKernelWriter.h
index d7cf24d5e6..6485bae512 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.h
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.h
@@ -26,6 +26,7 @@
#define CKW_SRC_CL_CLKERNELWRITER_H
#include "ckw/KernelWriter.h"
+
#include "src/TileView.h"
#include <memory>
@@ -73,7 +74,11 @@ public:
void op_binary(const TileOperand &dst, BinaryOp op, const TileOperand &first, const TileOperand &second) override;
- void op_ternary(const TileOperand &dst, TernaryOp op, const TileOperand &first, const TileOperand &second, const TileOperand &third) override;
+ void op_ternary(const TileOperand &dst,
+ TernaryOp op,
+ const TileOperand &first,
+ const TileOperand &second,
+ const TileOperand &third) override;
// =============================================================================================
// Flow control
@@ -81,14 +86,18 @@ public:
void op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) override;
- void op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) override;
+ void
+ op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body) override;
void op_else(const std::function<void()> &body) override;
- void op_for_loop(
- const TileOperand &var, BinaryOp cond_op, const TileOperand &cond_value,
- const TileOperand &update_var, AssignmentOp update_op, const TileOperand &update_value,
- const std::function<void()> &body) override;
+ void op_for_loop(const TileOperand &var,
+ BinaryOp cond_op,
+ const TileOperand &cond_value,
+ const TileOperand &update_var,
+ AssignmentOp update_op,
+ const TileOperand &update_value,
+ const std::function<void()> &body) override;
void op_return() override;
@@ -132,26 +141,49 @@ public:
// Memory Operations
// =============================================================================================
- void op_load(
- const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) override;
-
- void op_load_dilated(
- const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileOperand &dilation_x, const TileOperand &dilation_y) override;
-
- void op_store(
- const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) override;
-
- void op_store_dilated(
- const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileOperand &dilation_x, const TileOperand &dilation_y) override;
-
- void op_load_indirect(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) override;
+ void op_load(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch) override;
+
+ void op_load_dilated(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileOperand &dilation_x,
+ const TileOperand &dilation_y) override;
+
+ void op_store(const TensorOperand &tensor_op,
+ const TileOperand &tile_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch) override;
+
+ void op_store_dilated(const TensorOperand &tensor_op,
+ const TileOperand &tile_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileOperand &dilation_x,
+ const TileOperand &dilation_y) override;
+
+ void op_load_indirect(const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch) override;
protected:
/** Return a tile view containing a reference to a @ref CLTile object and the active area.
@@ -181,9 +213,17 @@ protected:
// For helper functions
private:
/** Helper method to consolidate all load/store logic in this class */
- void op_load_store(MemoryOperation op, const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
- const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
- const TileView<CLTile> &dilation_x, const TileView<CLTile> &dilation_y, bool indirect_buffer);
+ void op_load_store(MemoryOperation op,
+ const TileOperand &tile_op,
+ const TensorOperand &tensor_op,
+ TensorSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &z,
+ const TileOperand &batch,
+ const TileView<CLTile> &dilation_x,
+ const TileView<CLTile> &dilation_y,
+ bool indirect_buffer);
/** This function is the generic function to write both `if` and `else if` blocks.
*
@@ -195,7 +235,11 @@ private:
* @param[in] body The function that writes the body of the else-if block.
* @param[in] is_else_if True if this is an `else if` block, otherwise this is an `if` block.
*/
- void op_if_generic(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body, bool is_else_if);
+ void op_if_generic(const TileOperand &lhs,
+ BinaryOp op,
+ const TileOperand &rhs,
+ const std::function<void()> &body,
+ bool is_else_if);
// For attributes
private:
diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.cpp b/compute_kernel_writer/src/cl/CLTensorArgument.cpp
index 7d4dc958df..e53de2830d 100644
--- a/compute_kernel_writer/src/cl/CLTensorArgument.cpp
+++ b/compute_kernel_writer/src/cl/CLTensorArgument.cpp
@@ -23,11 +23,13 @@
*/
#include "src/cl/CLTensorArgument.h"
+
#include "ckw/Error.h"
-#include "src/ITensorArgument.h"
-#include "src/ITensorComponent.h"
+
#include "src/cl/CLHelpers.h"
#include "src/cl/CLTensorComponent.h"
+#include "src/ITensorArgument.h"
+#include "src/ITensorComponent.h"
#include "src/types/TensorComponentType.h"
#include <algorithm>
@@ -48,25 +50,23 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x)
{
// Return the component if it has already been created.
{
- const auto it = std::find_if(
- _components_used.begin(), _components_used.end(),
- [=](const std::unique_ptr<CLTensorComponent> &item)
- {
- return item->component_type() == x;
- });
+ const auto it =
+ std::find_if(_components_used.begin(), _components_used.end(),
+ [=](const std::unique_ptr<CLTensorComponent> &item) { return item->component_type() == x; });
- if(it != _components_used.end())
+ if (it != _components_used.end())
{
return **it;
}
}
- if(_return_dims_by_value)
+ if (_return_dims_by_value)
{
uint32_t component_type = static_cast<uint32_t>(x);
- const bool is_dimension = (component_type & static_cast<uint32_t>(TensorComponentBitmask::Dimension)) != 0;
- const bool is_folded_dimensions = (component_type & static_cast<uint32_t>(TensorComponentBitmask::FoldedDimensions)) != 0;
+ const bool is_dimension = (component_type & static_cast<uint32_t>(TensorComponentBitmask::Dimension)) != 0;
+ const bool is_folded_dimensions =
+ (component_type & static_cast<uint32_t>(TensorComponentBitmask::FoldedDimensions)) != 0;
constexpr auto bitmask_all = static_cast<uint32_t>(TensorComponentIndexBitmask::All);
constexpr auto bitmask_index_0 = static_cast<uint32_t>(TensorComponentIndexBitmask::Index0);
@@ -83,16 +83,16 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x)
CKW_ASSERT(bitmask_index_2 == bitmask_index_3 >> 4);
// If we have a dimension or folded dimensions, we can return the corresponding value if it is not dynamic (not equal to -1)
- if(is_dimension == true || is_folded_dimensions == true)
+ if (is_dimension == true || is_folded_dimensions == true)
{
component_type = component_type & bitmask_all;
int32_t idx = 1;
- for(int32_t i = 0; i < tensor_component_index_max_count; ++i)
+ for (int32_t i = 0; i < tensor_component_index_max_count; ++i)
{
uint32_t dim_idx = component_type & bitmask_index_0;
- if(dim_idx == 0)
+ if (dim_idx == 0)
{
// Stop at the first nibble containing 0
break;
@@ -104,7 +104,7 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x)
// Get the dimension value
const int32_t dim_val = _info.shape()[dim_idx];
- if(dim_val == kDynamicTensorDimensionValue)
+ if (dim_val == kDynamicTensorDimensionValue)
{
// We cannot return the dimension by value if it is dynamic.
// Therefore, force the idx variable to kDynamicTensorDimensionValue and break the loop.
@@ -118,7 +118,7 @@ CLTensorComponent &CLTensorArgument::cl_component(TensorComponentType x)
component_type >>= 4;
}
- if(idx != kDynamicTensorDimensionValue)
+ if (idx != kDynamicTensorDimensionValue)
{
_components_used.emplace_back(std::make_unique<CLTensorComponent>(*this, x, idx));
@@ -141,14 +141,10 @@ TensorStorageVariable &CLTensorArgument::storage(TensorStorageType x)
{
// Return the storage if it has already been created.
{
- const auto it = std::find_if(
- _storages_used.begin(), _storages_used.end(),
- [=](const TensorStorageVariable &item)
- {
- return item.type == x;
- });
+ const auto it = std::find_if(_storages_used.begin(), _storages_used.end(),
+ [=](const TensorStorageVariable &item) { return item.type == x; });
- if(it != _storages_used.end())
+ if (it != _storages_used.end())
{
return *it;
}
@@ -167,7 +163,7 @@ std::string CLTensorArgument::create_storage_name(TensorStorageType x) const
{
std::string var_name = _basename;
- switch(x)
+ switch (x)
{
case TensorStorageType::BufferUint8Ptr:
var_name += "_ptr";
@@ -198,9 +194,9 @@ std::vector<const ITensorComponent *> CLTensorArgument::components() const
{
std::vector<const ITensorComponent *> components;
- for(const auto &component : _components_used)
+ for (const auto &component : _components_used)
{
- if(component->is_assignable())
+ if (component->is_assignable())
{
components.push_back(component.get());
}
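create_storage_name above derives the kernel argument name from the tensor base name and the storage type. Below is a minimal standalone sketch covering only the buffer case shown in this hunk; the base name "src" is an illustrative assumption and the texture suffixes are not reproduced.

#include <iostream>
#include <string>

enum class TensorStorageType
{
    BufferUint8Ptr,
    Texture2dReadOnly,
    Texture2dWriteOnly
};

static std::string create_storage_name(const std::string &basename, TensorStorageType x)
{
    std::string var_name = basename;
    switch (x)
    {
        case TensorStorageType::BufferUint8Ptr:
            var_name += "_ptr"; // matches the BufferUint8Ptr case in the switch above
            break;
        default:
            // The texture storage types append their own suffixes (not reproduced here).
            break;
    }
    return var_name;
}

int main()
{
    std::cout << create_storage_name("src", TensorStorageType::BufferUint8Ptr) << std::endl; // src_ptr
    return 0;
}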
diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.h b/compute_kernel_writer/src/cl/CLTensorArgument.h
index 4cbbee21ee..35df51422e 100644
--- a/compute_kernel_writer/src/cl/CLTensorArgument.h
+++ b/compute_kernel_writer/src/cl/CLTensorArgument.h
@@ -26,7 +26,9 @@
#include "ckw/types/TensorComponentType.h"
#include "ckw/types/TensorStorageType.h"
+
#include "src/ITensor.h"
+
#include <memory>
#include <string>
#include <vector>
@@ -67,7 +69,7 @@ public:
* unlike @ref CLTensorComponent::component, which is for the public API and only returns
* a reference to a generic @ref ITile object.
*/
- CLTensorComponent& cl_component(TensorComponentType component_type);
+ CLTensorComponent &cl_component(TensorComponentType component_type);
// Inherited method overridden
TensorStorageVariable &storage(TensorStorageType x) override;
@@ -78,7 +80,7 @@ public:
private:
std::string create_storage_name(TensorStorageType x) const;
- bool _return_dims_by_value{ false };
+ bool _return_dims_by_value{false};
std::vector<TensorStorageVariable> _storages_used{};
std::vector<std::unique_ptr<CLTensorComponent>> _components_used{};
};
diff --git a/compute_kernel_writer/src/cl/CLTensorComponent.cpp b/compute_kernel_writer/src/cl/CLTensorComponent.cpp
index c29b307748..dbe2036768 100644
--- a/compute_kernel_writer/src/cl/CLTensorComponent.cpp
+++ b/compute_kernel_writer/src/cl/CLTensorComponent.cpp
@@ -23,8 +23,10 @@
*/
#include "src/cl/CLTensorComponent.h"
+
#include "ckw/Error.h"
#include "ckw/types/TensorComponentType.h"
+
#include "src/cl/CLTensorArgument.h"
#include "src/cl/CLTile.h"
@@ -38,7 +40,7 @@ std::string create_component_name(const std::string &name, TensorComponentType x
{
std::string var_name(name);
- switch(x)
+ switch (x)
{
case TensorComponentType::OffsetFirstElement:
var_name += "_offset_first_element";
@@ -93,12 +95,13 @@ std::string create_component_name(const std::string &name, TensorComponentType x
} // namespace
CLTensorComponent::CLTensorComponent(const CLTensorArgument &tensor, TensorComponentType component_type)
- : CLTile(create_component_name(tensor.name(), component_type), TileInfo(DataType::Int32)), _component_type(component_type)
+ : CLTile(create_component_name(tensor.name(), component_type), TileInfo(DataType::Int32)),
+ _component_type(component_type)
{
}
CLTensorComponent::CLTensorComponent(const CLTensorArgument &tensor, TensorComponentType component_type, int32_t value)
- : CLTile({ { std::to_string(value) } }, DataType::Int32), _component_type(component_type)
+ : CLTile({{std::to_string(value)}}, DataType::Int32), _component_type(component_type)
{
CKW_UNUSED(tensor);
}
diff --git a/compute_kernel_writer/src/cl/CLTensorComponent.h b/compute_kernel_writer/src/cl/CLTensorComponent.h
index 42a42666dc..731597ebbf 100644
--- a/compute_kernel_writer/src/cl/CLTensorComponent.h
+++ b/compute_kernel_writer/src/cl/CLTensorComponent.h
@@ -26,8 +26,9 @@
#define CKW_SRC_CL_CLTENSORCOMPONENT_H
#include "ckw/types/TensorComponentType.h"
-#include "src/ITensorComponent.h"
+
#include "src/cl/CLTile.h"
+#include "src/ITensorComponent.h"
namespace ckw
{
@@ -72,7 +73,7 @@ public:
TensorComponentType component_type() const override;
private:
- TensorComponentType _component_type{ TensorComponentType::Unknown };
+ TensorComponentType _component_type{TensorComponentType::Unknown};
};
} // namespace ckw
diff --git a/compute_kernel_writer/src/cl/CLTile.cpp b/compute_kernel_writer/src/cl/CLTile.cpp
index 0cce69a9e1..f6e271e813 100644
--- a/compute_kernel_writer/src/cl/CLTile.cpp
+++ b/compute_kernel_writer/src/cl/CLTile.cpp
@@ -21,20 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "src/cl/CLTile.h"
+
#include "ckw/Error.h"
#include "ckw/TileInfo.h"
-#include "src/Helpers.h"
#include "src/cl/CLHelpers.h"
-#include "src/cl/CLTile.h"
+#include "src/Helpers.h"
#include <algorithm>
#include <vector>
namespace ckw
{
-CLTile::CLTile(const std::string &name, const TileInfo &info)
- : _is_constant(false)
+CLTile::CLTile(const std::string &name, const TileInfo &info) : _is_constant(false)
{
validate_tile_info(info);
@@ -42,8 +42,7 @@ CLTile::CLTile(const std::string &name, const TileInfo &info)
_info = info;
}
-CLTile::CLTile(const TileContainer &vals, DataType dt)
- : _is_constant(true)
+CLTile::CLTile(const TileContainer &vals, DataType dt) : _is_constant(true)
{
const int32_t w = vals[0].size();
const int32_t h = vals.size();
@@ -56,9 +55,9 @@ CLTile::CLTile(const TileContainer &vals, DataType dt)
_vals = TileContainer(h, std::vector<std::string>(w));
- for(int32_t y = 0; y < h; ++y)
+ for (int32_t y = 0; y < h; ++y)
{
- for(int32_t x = 0; x < w; ++x)
+ for (int32_t x = 0; x < w; ++x)
{
_vals[y][x] = vals[y][x];
}
@@ -81,7 +80,7 @@ TileVariable CLTile::scalar(int32_t row, int32_t col) const
col = clamp(col, static_cast<int32_t>(0), _info.width() - 1);
row = clamp(row, static_cast<int32_t>(0), _info.height() - 1);
- if(_is_constant)
+ if (_is_constant)
{
// We can use the vector method to retrieve the scalar variable stored in the constant tile
return vector(row, col, 1);
@@ -94,7 +93,7 @@ TileVariable CLTile::scalar(int32_t row, int32_t col) const
t.desc.len = 1;
// This check is required because, if the width is 1, the variable is a scalar and we cannot use .s0
- if(_info.width() != 1)
+ if (_info.width() != 1)
{
// Automatic broadcasting
t.str += ".s" + dec_to_hex_as_string(col);
@@ -109,7 +108,7 @@ TileVariable CLTile::vector(int32_t row) const
// Clamp to nearest valid edge
row = clamp(row, static_cast<int32_t>(0), _info.height() - 1);
- if(_is_constant)
+ if (_is_constant)
{
return vector(row, 0, _info.width());
}
@@ -138,14 +137,14 @@ TileVariable CLTile::vector(int32_t row, int32_t col_start, int32_t width) const
t.desc.dt = _info.data_type();
t.desc.len = width;
- if(_is_constant)
+ if (_is_constant)
{
// The vector has the following form: ((data_typeN)(val0, val1, ..., valN-1))
t.str = "((" + cl_get_variable_datatype_as_string(t.desc.dt, width) + ")";
t.str += "(";
int32_t col = col_start;
- for(; col < width - 1; ++col)
+ for (; col < width - 1; ++col)
{
t.str += _vals[row][col];
t.str += ", ";
@@ -157,10 +156,10 @@ TileVariable CLTile::vector(int32_t row, int32_t col_start, int32_t width) const
{
t.str = create_var_name(row);
- if(_info.width() != 1 && _info.width() != width)
+ if (_info.width() != 1 && _info.width() != width)
{
t.str += ".s";
- for(int i = 0; i < width; ++i)
+ for (int i = 0; i < width; ++i)
{
t.str += dec_to_hex_as_string(col_start + i);
}
@@ -174,11 +173,11 @@ std::vector<TileVariable> CLTile::all() const
{
std::vector<TileVariable> vars;
- if(_is_constant)
+ if (_is_constant)
{
- for(int32_t y = 0; y < _info.height(); ++y)
+ for (int32_t y = 0; y < _info.height(); ++y)
{
- for(int32_t x = 0; x < _info.width(); ++x)
+ for (int32_t x = 0; x < _info.width(); ++x)
{
// We can use the vector method to retrieve all the scalar variables stored in the constant tile
TileVariable t = vector(y, x, 1);
@@ -188,7 +187,7 @@ std::vector<TileVariable> CLTile::all() const
}
else
{
- for(int32_t y = 0; y < _info.height(); ++y)
+ for (int32_t y = 0; y < _info.height(); ++y)
{
TileVariable t;
t.str = create_var_name(y);
@@ -211,7 +210,7 @@ std::string CLTile::create_var_name(int32_t row) const
std::string var_name = _basename;
// If a scalar variable, we do not append the row index
- if(_info.height() > 1)
+ if (_info.height() > 1)
{
var_name += "__";
var_name += std::to_string(row);
@@ -222,7 +221,7 @@ std::string CLTile::create_var_name(int32_t row) const
std::vector<int32_t> CLTile::supported_vector_lengths() const
{
- return std::vector<int32_t>{ 1, 2, 3, 4, 8, 16 };
+ return std::vector<int32_t>{1, 2, 3, 4, 8, 16};
}
void CLTile::validate_tile_info(const TileInfo &info) const
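CLTile::vector above produces two textual forms: a literal vector for constant tiles and a variable name with an .s swizzle otherwise. The standalone sketch below shows both; the values and the variable name "t__1" are illustrative assumptions, and the hex lookup stands in for dec_to_hex_as_string.

#include <iostream>
#include <string>
#include <vector>

int main()
{
    // Constant tile row: ((data_typeN)(val0, val1, ..., valN-1))
    const std::vector<std::string> row_vals{"1.0f", "2.0f", "3.0f", "4.0f"};
    std::string constant = "((float4)(";
    for (size_t i = 0; i < row_vals.size(); ++i)
    {
        constant += row_vals[i];
        constant += (i + 1 < row_vals.size()) ? ", " : "))";
    }
    std::cout << constant << "\n"; // ((float4)(1.0f, 2.0f, 3.0f, 4.0f))

    // Non-constant sub-vector: variable name plus an .s swizzle in hex, here columns 0..3.
    std::string swizzle = "t__1.s";
    for (int col = 0; col < 4; ++col)
    {
        swizzle += "0123456789ABCDEF"[col]; // stand-in for dec_to_hex_as_string
    }
    std::cout << swizzle << "\n"; // t__1.s0123
    return 0;
}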
diff --git a/compute_kernel_writer/src/cl/CLTile.h b/compute_kernel_writer/src/cl/CLTile.h
index 1fb0fc9dbe..498cf51034 100644
--- a/compute_kernel_writer/src/cl/CLTile.h
+++ b/compute_kernel_writer/src/cl/CLTile.h
@@ -25,6 +25,7 @@
#define COMPUTE_KERNEL_WRITER_SRC_CL_CLTILE_H
#include "src/ITile.h"
+
#include <string>
namespace ckw
@@ -75,9 +76,9 @@ private:
std::string create_var_name(int32_t row) const;
- TileInfo _info{ DataType::Unknown };
- std::string _basename{ "" };
- bool _is_constant{ false };
+ TileInfo _info{DataType::Unknown};
+ std::string _basename{""};
+ bool _is_constant{false};
TileContainer _vals{};
};
} // namespace ckw
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
index f906bcd4b1..a98ebed8fa 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
@@ -28,20 +28,25 @@
#include "ckw/types/MemoryOperation.h"
#include "ckw/types/TensorStorageType.h"
-#include "src/ITensor.h"
-#include "src/Tensor3dMapper.h"
#include "src/cl/CLHelpers.h"
#include "src/cl/CLKernelWriter.h"
#include "src/cl/CLTensorArgument.h"
#include "src/cl/CLTile.h"
+#include "src/ITensor.h"
+#include "src/Tensor3dMapper.h"
namespace ckw
{
-bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer, const ITensor *tensor, const TensorSampler *sampler, const Tensor3dMapper *mapper, MemoryOperation op, const CLTile *dst)
+bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const CLTile *dst)
{
CKW_UNUSED(writer, tensor, mapper, op, dst);
- if(sampler->storage() != TensorStorageType::BufferUint8Ptr)
+ if (sampler->storage() != TensorStorageType::BufferUint8Ptr)
{
return false;
}
@@ -97,15 +102,15 @@ bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer, const ITenso
*/
void CLMemoryOpBufferHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
{
- _dst = dst;
+ _dst = dst;
CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst));
_ls_width_full = dst->info().width();
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
- _coord_orig_z = _coord_z;
+ _coord_x = x->scalar(0, 0).str;
+ _coord_z = z->scalar(0, 0).str;
+ _coord_b = b->scalar(0, 0).str;
+ _coord_orig_z = _coord_z;
out_of_bound_initialize_x(_coord_x);
out_of_bound_initialize_z(_coord_z);
@@ -126,10 +131,10 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
out_of_bound_finalize_y(dst);
// The left over load/store will be written in the finalize stage
- if(_ls_width_part.size() != 0)
+ if (_ls_width_part.size() != 0)
{
int32_t col_start = 0;
- for(int32_t partial_width : _ls_width_part)
+ for (int32_t partial_width : _ls_width_part)
{
const std::string dst = _dst->vector(row_id, col_start, partial_width).str;
const std::string coord_x = _coord_x + " + " + std::to_string(col_start);
@@ -150,13 +155,13 @@ void CLMemoryOpBufferHelper::finalize()
void CLMemoryOpBufferHelper::out_of_bound_initialize_x(const std::string &coord)
{
- if(_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin)
+ if (_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin)
{
- TensorInfo tensor_info = _tensor->info();
- TensorShape shape = tensor_info.shape();
+ TensorInfo tensor_info = _tensor->info();
+ TensorShape shape = tensor_info.shape();
_ls_width_part = cl_decompose_vector_width(shape[0] % _ls_width_full);
- if(_ls_width_part.size() != 0)
+ if (_ls_width_part.size() != 0)
{
_writer->op_write_raw_code("if(" + coord + " > 0)\n{\n");
}
@@ -165,14 +170,14 @@ void CLMemoryOpBufferHelper::out_of_bound_initialize_x(const std::string &coord)
void CLMemoryOpBufferHelper::out_of_bound_finalize_x()
{
- if(_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin)
+ if (_sampler->address_mode_x() == TensorSamplerAddressModeX::OverlappingMin)
{
- if(_ls_width_part.size() != 0)
+ if (_ls_width_part.size() != 0)
{
_writer->op_write_raw_code("}\nelse\n{\n");
out_of_bound_initialize_z(_coord_orig_z);
- for(LeftoverDescriptor leftover_desc : _leftovers_x)
+ for (LeftoverDescriptor leftover_desc : _leftovers_x)
{
out_of_bound_initialize_y(leftover_desc.coord);
_writer->op_write_raw_code(leftover_desc.statement);
@@ -191,7 +196,7 @@ void CLMemoryOpBufferHelper::out_of_bound_initialize_y(const std::string &coord)
const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
// Not to be moved outside the case because it marks the relevant tensor component as used even if we don't use the variable
@@ -212,7 +217,7 @@ void CLMemoryOpBufferHelper::out_of_bound_finalize_y(const std::string &dst)
{
const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
_writer->op_write_raw_code("}\nelse\n{\n");
@@ -234,7 +239,7 @@ void CLMemoryOpBufferHelper::out_of_bound_initialize_z(const std::string &coord)
CKW_UNUSED(coord);
const TensorSamplerAddressModeZ address_mode_z = _sampler->address_mode_z();
- switch(address_mode_z)
+ switch (address_mode_z)
{
case TensorSamplerAddressModeZ::None:
break;
@@ -247,7 +252,7 @@ void CLMemoryOpBufferHelper::out_of_bound_finalize_z()
{
const TensorSamplerAddressModeZ address_mode_z = _sampler->address_mode_z();
- switch(address_mode_z)
+ switch (address_mode_z)
{
case TensorSamplerAddressModeZ::None:
break;
@@ -256,13 +261,15 @@ void CLMemoryOpBufferHelper::out_of_bound_finalize_z()
}
}
-std::string CLMemoryOpBufferHelper::to_statement(MemoryOperation op, int32_t vector_width, const std::string &data,
- const std::string &address) const
+std::string CLMemoryOpBufferHelper::to_statement(MemoryOperation op,
+ int32_t vector_width,
+ const std::string &data,
+ const std::string &address) const
{
- switch(op)
+ switch (op)
{
case MemoryOperation::Load:
- if(vector_width != 1)
+ if (vector_width != 1)
{
return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
}
@@ -272,7 +279,7 @@ std::string CLMemoryOpBufferHelper::to_statement(MemoryOperation op, int32_t vec
}
break;
case MemoryOperation::Store:
- if(vector_width != 1)
+ if (vector_width != 1)
{
return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
}
@@ -288,26 +295,28 @@ std::string CLMemoryOpBufferHelper::to_statement(MemoryOperation op, int32_t vec
return "";
}
-std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x, const std::string &y, const std::string &z,
- const std::string &b) const
+std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x,
+ const std::string &y,
+ const std::string &z,
+ const std::string &b) const
{
TensorStorageType tensor_storage = _sampler->storage();
CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr);
- const std::string ptr_buf = _tensor->storage(tensor_storage).val;
- const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1);
+ const std::string ptr_buf = _tensor->storage(tensor_storage).val;
+ const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1);
std::string address;
address += "(__global ";
address += dst_type;
address += "*)(";
address += ptr_buf;
- if(x != "0" && (_mapper->dim_x().str != "1"))
+ if (x != "0" && (_mapper->dim_x().str != "1"))
{
address += " + (";
address += x + ") * sizeof(" + dst_type + ")";
}
- if(y != "0")
+ if (y != "0")
{
const std::string stride_y = _mapper->stride_y().str;
address += " + (";
@@ -315,7 +324,7 @@ std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x, cons
address += " * ";
address += stride_y;
}
- if(z != "0" && (_mapper->dim_z().str != "1"))
+ if (z != "0" && (_mapper->dim_z().str != "1"))
{
const std::string stride_z = _mapper->stride_z().str;
address += " + (";
@@ -323,7 +332,7 @@ std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x, cons
address += " * ";
address += stride_z;
}
- if(b != "0" && (_mapper->dim_batch().str != "1"))
+ if (b != "0" && (_mapper->dim_batch().str != "1"))
{
const std::string stride_b = _mapper->stride_batch().str;
address += " + (";
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
index 9bcd571a81..4e1a842fe1 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
@@ -27,9 +27,9 @@
#include "src/cl/helpers/ICLMemoryOpHelper.h"
+#include <cstdint>
#include <string>
#include <vector>
-#include <cstdint>
namespace ckw
{
@@ -65,20 +65,25 @@ private:
struct LeftoverDescriptor
{
LeftoverDescriptor(const std::string &dst, const std::string &coord, const std::string &statement)
- : dst(dst), coord(coord), statement(statement)
+ : dst(dst), coord(coord), statement(statement)
{
}
- std::string dst{}; // Describes the destination tile or part of it
- std::string coord{}; // Describes the coordinate to be used in boundary checks
- std::string statement{}; // Describes the memory operation statement
+ std::string dst{}; // Describes the destination tile or part of it
+ std::string coord{}; // Describes the coordinate to be used in boundary checks
+ std::string statement{}; // Describes the memory operation statement
};
std::vector<int32_t> _ls_width_part{};
std::vector<LeftoverDescriptor> _leftovers_x{};
std::string _coord_orig_z{};
- static bool validate(const CLKernelWriter *writer, const ITensor *tensor, const TensorSampler *sampler, const Tensor3dMapper *mapper, MemoryOperation op, const CLTile *dst);
+ static bool validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const CLTile *dst);
void out_of_bound_initialize_x(const std::string &coord);
void out_of_bound_finalize_x();
@@ -87,8 +92,10 @@ private:
void out_of_bound_initialize_z(const std::string &coord);
void out_of_bound_finalize_z();
- std::string to_statement(MemoryOperation op, int32_t vector_width, const std::string &data, const std::string &address) const;
- std::string to_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const;
+ std::string
+ to_statement(MemoryOperation op, int32_t vector_width, const std::string &data, const std::string &address) const;
+ std::string
+ to_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const;
};
} // namespace ckw
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
index 55f88f4136..b7d146bdee 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
@@ -28,11 +28,11 @@
#include "ckw/types/MemoryOperation.h"
#include "ckw/types/TensorStorageType.h"
-#include "src/ITensor.h"
-#include "src/Tensor3dMapper.h"
#include "src/cl/CLKernelWriter.h"
#include "src/cl/CLTensorArgument.h"
#include "src/cl/CLTile.h"
+#include "src/ITensor.h"
+#include "src/Tensor3dMapper.h"
namespace ckw
{
@@ -66,31 +66,36 @@ void CLMemoryOpImage2dHelper::finalize()
{
}
-bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer, const ITensor *tensor, const TensorSampler *sampler, const Tensor3dMapper *mapper, MemoryOperation op, const CLTile *dst)
+bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const CLTile *dst)
{
CKW_UNUSED(writer, tensor, mapper);
- if(dst->info().width() != 4)
+ if (dst->info().width() != 4)
{
return false;
}
- if(sampler->address_mode_x() != TensorSamplerAddressModeX::None)
+ if (sampler->address_mode_x() != TensorSamplerAddressModeX::None)
{
return false;
}
- if(sampler->address_mode_z() != TensorSamplerAddressModeZ::None)
+ if (sampler->address_mode_z() != TensorSamplerAddressModeZ::None)
{
return false;
}
- if(sampler->storage() != TensorStorageType::Texture2dReadOnly && op == MemoryOperation::Load)
+ if (sampler->storage() != TensorStorageType::Texture2dReadOnly && op == MemoryOperation::Load)
{
return false;
}
- if(sampler->storage() != TensorStorageType::Texture2dWriteOnly && op == MemoryOperation::Store)
+ if (sampler->storage() != TensorStorageType::Texture2dWriteOnly && op == MemoryOperation::Store)
{
return false;
}
- if((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16))
+ if ((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16))
{
return false;
}
@@ -102,7 +107,7 @@ void CLMemoryOpImage2dHelper::out_of_bound_initialize_y(const std::string &coord
CKW_UNUSED(coord);
const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::SkipLessThanZero:
_writer->op_write_raw_code("if(" + coord + " >= 0)\n{\n");
@@ -118,7 +123,7 @@ void CLMemoryOpImage2dHelper::out_of_bound_initialize_y(const std::string &coord
void CLMemoryOpImage2dHelper::out_of_bound_finalize_y()
{
const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::SkipLessThanZero:
_writer->op_write_raw_code("}\n");
@@ -131,15 +136,19 @@ void CLMemoryOpImage2dHelper::out_of_bound_finalize_y()
}
}
-std::string CLMemoryOpImage2dHelper::to_ls_image2d(MemoryOperation op, int32_t vector_width, const std::string &data, const std::string &sampler, const std::string &address) const
+std::string CLMemoryOpImage2dHelper::to_ls_image2d(MemoryOperation op,
+ int32_t vector_width,
+ const std::string &data,
+ const std::string &sampler,
+ const std::string &address) const
{
CKW_UNUSED(vector_width);
const TensorStorageType tensor_storage = _sampler->storage();
- const std::string image2d_obj = _tensor->storage(tensor_storage).val;
- const std::string post_fix = _dst->info().data_type() == DataType::Fp32 ? "f" : "h";
+ const std::string image2d_obj = _tensor->storage(tensor_storage).val;
+ const std::string post_fix = _dst->info().data_type() == DataType::Fp32 ? "f" : "h";
- switch(op)
+ switch (op)
{
case MemoryOperation::Load:
return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + address + ")";
@@ -155,7 +164,7 @@ std::string CLMemoryOpImage2dHelper::to_ls_image2d_sampler() const
{
const auto address_mode_y = _sampler->address_mode_y();
- switch(address_mode_y)
+ switch (address_mode_y)
{
case TensorSamplerAddressModeY::None:
return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
@@ -167,17 +176,19 @@ std::string CLMemoryOpImage2dHelper::to_ls_image2d_sampler() const
}
}
-std::string CLMemoryOpImage2dHelper::to_ls_image2d_address(const std::string &x, const std::string &y, const std::string &z,
+std::string CLMemoryOpImage2dHelper::to_ls_image2d_address(const std::string &x,
+ const std::string &y,
+ const std::string &z,
const std::string &b) const
{
std::string coord_x = "(" + x + ") >> 2";
std::string coord_y = "(";
- if(y != "0")
+ if (y != "0")
{
coord_y += y;
}
- if(z != "0" && (_mapper->dim_z().str != "1"))
+ if (z != "0" && (_mapper->dim_z().str != "1"))
{
const std::string dim = _mapper->dim_y().str;
coord_y += " + (";
@@ -185,7 +196,7 @@ std::string CLMemoryOpImage2dHelper::to_ls_image2d_address(const std::string &x,
coord_y += " * ";
coord_y += dim;
}
- if(b != "0" && (_mapper->dim_batch().str != "1"))
+ if (b != "0" && (_mapper->dim_batch().str != "1"))
{
const std::string dim0 = _mapper->dim_y().str;
const std::string dim1 = _mapper->dim_z().str;
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
index 73bede7789..fd9b097a24 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
@@ -59,14 +59,24 @@ public:
void finalize() override;
private:
- static bool validate(const CLKernelWriter *writer, const ITensor *tensor, const TensorSampler *sampler, const Tensor3dMapper *mapper, MemoryOperation op, const CLTile *dst);
+ static bool validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const CLTile *dst);
void out_of_bound_initialize_y(const std::string &coord);
void out_of_bound_finalize_y();
- std::string to_ls_image2d(MemoryOperation op, int32_t vector_width, const std::string &data, const std::string &sampler, const std::string &address) const;
+ std::string to_ls_image2d(MemoryOperation op,
+ int32_t vector_width,
+ const std::string &data,
+ const std::string &sampler,
+ const std::string &address) const;
std::string to_ls_image2d_sampler() const;
- std::string to_ls_image2d_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const;
+ std::string
+ to_ls_image2d_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const;
};
} // namespace ckw
diff --git a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
index 7f363431e8..f46fee9750 100644
--- a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
@@ -26,6 +26,7 @@
#define CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H
#include "ckw/TensorSampler.h"
+
#include "src/Tensor3dMapper.h"
#include <cstdint>
@@ -98,16 +99,16 @@ public:
virtual void finalize() = 0;
protected:
- CLKernelWriter *_writer{ nullptr };
- ITensor *_tensor{ nullptr };
- TensorSampler *_sampler{ nullptr };
- MemoryOperation _op;
- std::unique_ptr<Tensor3dMapper> _mapper{ nullptr };
- const CLTile *_dst{ nullptr };
- int32_t _ls_width_full{ 0 };
- std::string _coord_x{};
- std::string _coord_z{};
- std::string _coord_b{};
+ CLKernelWriter *_writer{nullptr};
+ ITensor *_tensor{nullptr};
+ TensorSampler *_sampler{nullptr};
+ MemoryOperation _op;
+ std::unique_ptr<Tensor3dMapper> _mapper{nullptr};
+ const CLTile *_dst{nullptr};
+ int32_t _ls_width_full{0};
+ std::string _coord_x{};
+ std::string _coord_z{};
+ std::string _coord_b{};
};
} // namespace ckw
diff --git a/compute_kernel_writer/src/types/ConstantData.cpp b/compute_kernel_writer/src/types/ConstantData.cpp
index d2155cf55a..67b1103860 100644
--- a/compute_kernel_writer/src/types/ConstantData.cpp
+++ b/compute_kernel_writer/src/types/ConstantData.cpp
@@ -30,52 +30,51 @@ namespace ckw
{
namespace
{
- template<typename T>
- inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
- {
- std::stringstream ss;
- ss << std::scientific << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
- return ss.str();
- }
+template <typename T>
+inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
+{
+ std::stringstream ss;
+ ss << std::scientific << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
+ return ss.str();
+}
- template<typename T>
- inline typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type to_str(T value)
- {
- return std::to_string(value);
- }
+template <typename T>
+inline typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
+to_str(T value)
+{
+ return std::to_string(value);
+}
- template<typename T>
- inline typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
- {
- return std::to_string((int) value);
- }
+template <typename T>
+inline typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
+{
+ return std::to_string((int)value);
}
+} // namespace
-template<typename T>
+template <typename T>
ConstantData::ConstantData(std::initializer_list<std::initializer_list<T>> values, DataType data_type)
: _data_type(data_type)
{
CKW_ASSERT(validate<T>(data_type));
CKW_ASSERT(values.size() > 0);
- for(auto value_arr: values)
+ for (auto value_arr : values)
{
// Each row must have the same number of elements
CKW_ASSERT(value_arr.size() == (*values.begin()).size());
StringVector vec;
- std::transform(value_arr.begin(), value_arr.end(),
- std::back_inserter(vec),
- [](T val) { return to_str(val); });
+ std::transform(value_arr.begin(), value_arr.end(), std::back_inserter(vec), [](T val) { return to_str(val); });
_values.push_back(std::move(vec));
}
}
-template<typename T>
+template <typename T>
bool ConstantData::validate(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::Fp32:
case DataType::Fp16:
@@ -107,7 +106,7 @@ template bool ConstantData::validate<uint32_t>(DataType);
template bool ConstantData::validate<bool>(DataType);
template bool ConstantData::validate<float>(DataType);
-const std::vector<std::vector<std::string>>& ConstantData::values() const
+const std::vector<std::vector<std::string>> &ConstantData::values() const
{
return _values;
}
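The float overload of to_str above prints values in scientific notation at max_digits10 so constants survive the round trip through generated source text. A standalone sketch of just that branch, with sample values chosen for illustration:

#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>

static std::string to_str_float(float value)
{
    std::stringstream ss;
    // Scientific notation with max_digits10 keeps the textual form lossless for float.
    ss << std::scientific << std::setprecision(std::numeric_limits<float>::max_digits10) << value;
    return ss.str();
}

int main()
{
    std::cout << to_str_float(0.1f) << std::endl;   // e.g. 1.000000015e-01
    std::cout << to_str_float(255.0f) << std::endl; // e.g. 2.550000000e+02
    return 0;
}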
diff --git a/examples/cl_cache.cpp b/examples/cl_cache.cpp
index 6de62f7c5d..9da5b9176d 100644
--- a/examples/cl_cache.cpp
+++ b/examples/cl_cache.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/Utils.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
+#include "arm_compute/runtime/CL/Utils.h"
+
#include "utils/Utils.h"
using namespace arm_compute;
@@ -43,14 +44,15 @@ public:
bool do_setup(int argc, char **argv) override
{
- std::cout << "Once the program has run and created the file cache.bin, rerun with --restore_cache." << std::endl;
+ std::cout << "Once the program has run and created the file cache.bin, rerun with --restore_cache."
+ << std::endl;
CLScheduler::get().default_init();
- if(argc > 1)
+ if (argc > 1)
{
std::string argv1 = argv[1];
std::transform(argv1.begin(), argv1.end(), argv1.begin(), ::tolower);
- if(argv1 == "--restore_cache")
+ if (argv1 == "--restore_cache")
{
// Load the precompiled kernels from a file into the kernel library; this way, the next time they are
// needed, compilation won't be required.
@@ -110,11 +112,13 @@ private:
window.use_tensor_dimensions(reference.info()->tensor_shape());
Iterator it_ref(&reference, window);
Iterator it_res(&result, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- assert(*reinterpret_cast<unsigned char *>(it_ref.ptr()) == *reinterpret_cast<unsigned char *>(it_res.ptr()));
- },
- it_ref, it_res);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ assert(*reinterpret_cast<unsigned char *>(it_ref.ptr()) ==
+ *reinterpret_cast<unsigned char *>(it_res.ptr()));
+ },
+ it_ref, it_res);
reference.unmap();
result.unmap();
}
@@ -126,11 +130,9 @@ private:
window.use_tensor_dimensions(tensor.info()->tensor_shape());
Iterator it_tensor(&tensor, window);
unsigned char val(0);
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<unsigned char *>(it_tensor.ptr()) = val++;
- },
- it_tensor);
+ execute_window_loop(
+ window, [&](const Coordinates &) { *reinterpret_cast<unsigned char *>(it_tensor.ptr()) = val++; },
+ it_tensor);
tensor.unmap();
}
void init_tensor(const TensorShape shape, CLTensor &tensor, DataType type, DataLayout layout)
diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp
index 27af228954..68955c52f7 100644
--- a/examples/cl_sgemm.cpp
+++ b/examples/cl_sgemm.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
#include "utils/Utils.h"
#include <cstdlib>
@@ -50,15 +51,16 @@ public:
CLScheduler::get().default_init(&tuner);
std::ifstream stream;
- if(argc > 1)
+ if (argc > 1)
{
stream.open(argv[1], std::fstream::in);
}
- if(argc < 3 || (argc < 4 && stream.bad()))
+ if (argc < 3 || (argc < 4 && stream.bad()))
{
// Print help
- std::cout << "Usage: 1) ./build/cl_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] [alpha = 1] [beta = 0]\n";
+ std::cout << "Usage: 1) ./build/cl_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] [alpha "
+ "= 1] [beta = 0]\n";
std::cout << " 2) ./build/cl_sgemm M N K [alpha = 1.0f] [beta = 0.0f]\n\n";
std::cout << "Too few or no input_matrices provided. Using M=7, N=3, K=5, alpha=1.0f and beta=0.0f\n\n";
@@ -68,29 +70,29 @@ public:
}
else
{
- if(stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */
+ if (stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */
{
npy0.open(argv[1]);
npy0.init_tensor(src0, DataType::F32);
npy1.open(argv[2]);
npy1.init_tensor(src1, DataType::F32);
- if(argc > 3)
+ if (argc > 3)
{
stream.close();
stream.clear();
stream.open(argv[3], std::fstream::in);
- if(stream.good()) /* case with third file */
+ if (stream.good()) /* case with third file */
{
npy2.open(argv[3]);
npy2.init_tensor(src2, DataType::F32);
- if(argc > 4)
+ if (argc > 4)
{
// Convert string to float
alpha = strtof(argv[4], nullptr);
- if(argc > 5)
+ if (argc > 5)
{
// Convert string to float
beta = strtof(argv[5], nullptr);
@@ -101,7 +103,7 @@ public:
{
alpha = strtof(argv[3], nullptr);
- if(argc > 4)
+ if (argc > 4)
{
beta = strtof(argv[4], nullptr);
}
@@ -118,11 +120,11 @@ public:
src1.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
src2.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));
- if(argc > 4)
+ if (argc > 4)
{
alpha = strtof(argv[4], nullptr);
- if(argc > 5)
+ if (argc > 5)
{
beta = strtof(argv[5], nullptr);
}
@@ -141,7 +143,7 @@ public:
dst.allocator()->allocate();
// Fill the input images with either the data provided or random data
- if(npy0.is_open())
+ if (npy0.is_open())
{
npy0.fill_tensor(src0);
npy1.fill_tensor(src1);
@@ -149,7 +151,7 @@ public:
output_filename = "sgemm_out.npy";
is_fortran = npy0.is_fortran();
- if(npy2.is_open())
+ if (npy2.is_open())
{
src2.allocator()->allocate();
npy2.fill_tensor(src2);
@@ -179,7 +181,7 @@ public:
}
void do_teardown() override
{
- if(!output_filename.empty()) /* Save to .npy file */
+ if (!output_filename.empty()) /* Save to .npy file */
{
save_to_npy(dst, output_filename, is_fortran);
}
diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.cpp b/examples/gemm_tuner/CommonGemmExampleOptions.cpp
index bee202b99e..c2a465604a 100644
--- a/examples/gemm_tuner/CommonGemmExampleOptions.cpp
+++ b/examples/gemm_tuner/CommonGemmExampleOptions.cpp
@@ -39,7 +39,8 @@ using namespace utils;
return os;
}
-CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type)
+CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser,
+ arm_compute::DataType default_data_type)
: help(parser.add_option<ToggleOption>("help")),
M(parser.add_positional_option<SimpleOption<size_t>>("M", 100)),
N(parser.add_positional_option<SimpleOption<size_t>>("N", 100)),
@@ -48,21 +49,16 @@ CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLi
data_type(),
tuner_mode()
{
- const std::set<DataType> supported_data_types
- {
+ const std::set<DataType> supported_data_types{
DataType::F16,
DataType::F32,
DataType::QASYMM8,
};
- const std::set<CLTunerMode> supported_tuner_modes
- {
- CLTunerMode::EXHAUSTIVE,
- CLTunerMode::NORMAL,
- CLTunerMode::RAPID
- };
+ const std::set<CLTunerMode> supported_tuner_modes{CLTunerMode::EXHAUSTIVE, CLTunerMode::NORMAL, CLTunerMode::RAPID};
- ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), "Default data type unsupported");
+ ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(),
+ "Default data type unsupported");
data_type = parser.add_option<EnumOption<DataType>>("type", supported_data_types, default_data_type);
tuner_mode = parser.add_option<EnumOption<CLTunerMode>>("tuner-mode", supported_tuner_modes, CLTunerMode::RAPID);
diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.h b/examples/gemm_tuner/CommonGemmExampleOptions.h
index f7447e3db3..38178bcef8 100644
--- a/examples/gemm_tuner/CommonGemmExampleOptions.h
+++ b/examples/gemm_tuner/CommonGemmExampleOptions.h
@@ -27,21 +27,22 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLTuner.h"
-#include "utils/TypePrinter.h"
+
#include "utils/command_line/CommandLineOptions.h"
#include "utils/command_line/CommandLineParser.h"
+#include "utils/TypePrinter.h"
namespace gemm_tuner
{
/** Structure holding all the common gemm example parameters */
struct CommonGemmExampleParams
{
- size_t M{ 100 }; /**< Number of lhs matrix rows */
- size_t N{ 100 }; /**< Number of rhs matrix columns */
- size_t K{ 50 }; /**< Number of lhs matrix columns/rhs matrix rows */
- size_t B{ 1 }; /**< Batch size */
- arm_compute::DataType data_type{ arm_compute::DataType::F32 }; /**< Data type */
- arm_compute::CLTunerMode tuner_mode{ arm_compute::CLTunerMode::RAPID }; /**< OpenCL tuner mode */
+ size_t M{100}; /**< Number of lhs matrix rows */
+ size_t N{100}; /**< Number of rhs matrix columns */
+ size_t K{50}; /**< Number of lhs matrix columns/rhs matrix rows */
+ size_t B{1}; /**< Batch size */
+ arm_compute::DataType data_type{arm_compute::DataType::F32}; /**< Data type */
+ arm_compute::CLTunerMode tuner_mode{arm_compute::CLTunerMode::RAPID}; /**< OpenCL tuner mode */
};
/** Formatted output of the CommonGemmExampleParams type
@@ -70,7 +71,8 @@ public:
* @param[in,out] parser A parser on which "parse()" hasn't been called yet.
* @param[in] default_data_type Default data type if unspecified.
*/
- CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type = arm_compute::DataType::F32);
+ CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser,
+ arm_compute::DataType default_data_type = arm_compute::DataType::F32);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CommonGemmExampleOptions(const CommonGemmExampleOptions &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -82,11 +84,11 @@ public:
/** Default destructor */
~CommonGemmExampleOptions() = default;
- arm_compute::utils::ToggleOption *help; /**< Show help option */
- arm_compute::utils::SimpleOption<size_t> *M; /**< Number of lhs matrix rows option */
- arm_compute::utils::SimpleOption<size_t> *N; /**< Number of rhs matrix columns option */
- arm_compute::utils::SimpleOption<size_t> *K; /**< Number of lhs matrix columns/rhs matrix rows option */
- arm_compute::utils::SimpleOption<size_t> *B; /**< Batch size option */
+ arm_compute::utils::ToggleOption *help; /**< Show help option */
+ arm_compute::utils::SimpleOption<size_t> *M; /**< Number of lhs matrix rows option */
+ arm_compute::utils::SimpleOption<size_t> *N; /**< Number of rhs matrix columns option */
+ arm_compute::utils::SimpleOption<size_t> *K; /**< Number of lhs matrix columns/rhs matrix rows option */
+ arm_compute::utils::SimpleOption<size_t> *B; /**< Batch size option */
arm_compute::utils::EnumOption<arm_compute::DataType> *data_type; /**< Data type */
arm_compute::utils::EnumOption<arm_compute::CLTunerMode> *tuner_mode; /**< OpenCL tuner mode */
};
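For reference, the snippet below is a minimal sketch (not part of this patch) of how CommonGemmExampleOptions is wired up by the tuner examples later in this diff: construct a CommandLineParser, register the options, then follow the parse / help / validate sequence those examples use. The standalone main() driver here is hypothetical; the class, parser calls and value() accessors mirror what is visible in the example sources and their consume_* helpers.

#include "examples/gemm_tuner/CommonGemmExampleOptions.h"
#include "utils/command_line/CommandLineParser.h"

#include <cstdlib>
#include <iostream>

int main(int argc, char **argv)
{
    arm_compute::utils::CommandLineParser parser;
    gemm_tuner::CommonGemmExampleOptions  param_options(parser); // default data type is F32

    parser.parse(argc, argv);
    if (param_options.help->is_set() && param_options.help->value())
    {
        parser.print_help(argv[0]);
        return EXIT_SUCCESS;
    }
    if (!parser.validate())
    {
        std::cerr << "Invalid arguments." << std::endl;
        return EXIT_FAILURE;
    }
    // Hypothetical use of the parsed values, as the tuner's consume_* helpers do.
    std::cout << "M=" << param_options.M->value() << " N=" << param_options.N->value()
              << " K=" << param_options.K->value() << " B=" << param_options.B->value() << std::endl;
    return EXIT_SUCCESS;
}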
diff --git a/examples/gemm_tuner/GemmTunerHelpers.h b/examples/gemm_tuner/GemmTunerHelpers.h
index ae5cfbb19e..dbff9e2dff 100644
--- a/examples/gemm_tuner/GemmTunerHelpers.h
+++ b/examples/gemm_tuner/GemmTunerHelpers.h
@@ -36,9 +36,9 @@ bool update_padding_for_cl_image(arm_compute::ITensorInfo *tensor)
constexpr unsigned int num_floats_per_pixel = 4;
const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size();
- const unsigned int pixel_aligment = arm_compute::get_cl_image_pitch_alignment(
- arm_compute::CLKernelLibrary::get().get_device());
- if(pixel_aligment == 0)
+ const unsigned int pixel_aligment =
+ arm_compute::get_cl_image_pitch_alignment(arm_compute::CLKernelLibrary::get().get_device());
+ if (pixel_aligment == 0)
{
return false;
}
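The hunk above only shows the first half of update_padding_for_cl_image(); the arithmetic it is built around is the usual cl_image constraint that a row of the buffer must map to a whole number of image pixels and respect the device pitch alignment. Below is a sketch of that rounding, under the assumption that the value returned by get_cl_image_pitch_alignment() is expressed in pixels of num_floats_per_pixel = 4 elements each; the helper's actual padding update is not shown in this diff.

// Assumption: pitch alignment is reported in pixels, each pixel packing four
// floats, so the padded row stride must be a multiple of 4 * pixel_alignment
// elements. Round the current stride up to that boundary.
unsigned int round_stride_for_cl_image(unsigned int stride_y_in_elements, unsigned int pixel_alignment)
{
    constexpr unsigned int num_floats_per_pixel  = 4;
    const unsigned int     alignment_in_elements = pixel_alignment * num_floats_per_pixel;
    return ((stride_y_in_elements + alignment_in_elements - 1) / alignment_in_elements) * alignment_in_elements;
}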
diff --git a/examples/gemm_tuner/cl_gemm_native.cpp b/examples/gemm_tuner/cl_gemm_native.cpp
index dd03873921..7daa0b07d3 100644
--- a/examples/gemm_tuner/cl_gemm_native.cpp
+++ b/examples/gemm_tuner/cl_gemm_native.cpp
@@ -25,19 +25,20 @@
#error "This example needs to be built with -DARM_COMPUTE_CL"
#endif /* ARM_COMPUTE_CL */
-#include "CommonGemmExampleOptions.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
#include "tests/CL/Helper.h"
-#include "utils/Utils.h"
#include "utils/command_line/CommandLineOptions.h"
#include "utils/command_line/CommandLineParser.h"
+#include "utils/Utils.h"
+#include "CommonGemmExampleOptions.h"
#include <cstdlib>
using namespace arm_compute;
@@ -51,9 +52,9 @@ namespace
/** Structure holding all tunable gemm configs specific to this example/strategy */
struct GemmConfigs
{
- size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */
- size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */
- size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */
+ size_t m0{4}; /**< Number of rows processed by the matrix multiplication */
+ size_t n0{4}; /**< Number of columns processed by the matrix multiplication */
+ size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */
};
/** Formatted output of the GemmConfigs type
@@ -145,13 +146,13 @@ public:
// Parse command line options
parser.parse(argc, argv);
- if(param_options.help->is_set() && param_options.help->value())
+ if (param_options.help->is_set() && param_options.help->value())
{
// Print help message
parser.print_help(argv[0]);
return false;
}
- if(!parser.validate())
+ if (!parser.validate())
{
// Invalid arguments. Use default parameters and configs
std::cerr << "Invalid arguments." << std::endl;
@@ -198,8 +199,9 @@ public:
// Validate arguments
Status status{};
- status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
- if(!status)
+ status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info,
+ kernel_info);
+ if (!status)
{
// Unsupported arguments
std::cerr << "Unsupported arguments." << std::endl;
@@ -221,11 +223,7 @@ public:
void do_run() override
{
// Execute the function
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
- { ACL_SRC_1, &rhs },
- { ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}});
gemm.run(gemm_pack);
// Make sure all the OpenCL jobs are done executing:
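The brace-initialised ITensorPack in do_run() above is the pattern every tuner example in this patch now shares. Below is a minimal sketch of that run step, assuming the tensors are allocated and the operator configured exactly as in cl_gemm_native.cpp; the operator type is left generic (a stand-in for whichever CLSynthetizeOperator alias the example defines via tests/CL/Helper.h), and the trailing synchronisation call is an assumption, since the hunk above stops at the comment.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

// Sketch only: GemmOperator stands in for the operator alias used by the
// example; lhs/rhs/bias/dst are assumed to be configured, allocated CLTensors.
template <typename GemmOperator>
void run_gemm_once(GemmOperator             &gemm,
                   arm_compute::CLTensor    &lhs,
                   arm_compute::CLTensor    &rhs,
                   arm_compute::CLTensor    &bias,
                   arm_compute::CLTensor    &dst)
{
    using namespace arm_compute;
    ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}});
    gemm.run(gemm_pack);
    CLScheduler::get().sync(); // assumed: block until the enqueued OpenCL jobs finish
}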
diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp
index 59044477bf..75f3539cb9 100644
--- a/examples/gemm_tuner/cl_gemm_reshaped.cpp
+++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp
@@ -31,14 +31,15 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "examples/gemm_tuner/CommonGemmExampleOptions.h"
#include "examples/gemm_tuner/GemmTunerHelpers.h"
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
#include "tests/CL/Helper.h"
-#include "utils/Utils.h"
#include "utils/command_line/CommandLineOptions.h"
#include "utils/command_line/CommandLineParser.h"
+#include "utils/Utils.h"
#include <cstdlib>
@@ -53,16 +54,16 @@ namespace
/** Structure holding all tunable gemm configs specific to this example/strategy */
struct GemmConfigs
{
- size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */
- size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */
- size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */
- size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_lhs{ true }; /**< Interleave lhs matrix */
- bool transpose_lhs{ true }; /**< Transpose lhs matrix. */
- bool interleave_rhs{ true }; /**< Interleave rhs matrix */
- bool transpose_rhs{ true }; /**< Transpose rhs matrix. */
- bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image. */
+ size_t m0{4}; /**< Number of rows processed by the matrix multiplication */
+ size_t n0{4}; /**< Number of columns processed by the matrix multiplication */
+ size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */
+ size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_lhs{true}; /**< Interleave lhs matrix */
+ bool transpose_lhs{true}; /**< Transpose lhs matrix. */
+ bool interleave_rhs{true}; /**< Interleave rhs matrix */
+ bool transpose_rhs{true}; /**< Transpose rhs matrix. */
+ bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image. */
};
/** Formatted output of the GemmConfigs type
@@ -119,8 +120,10 @@ public:
// FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and
// transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
// 2 variants (both transposed and none transposed)
- transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)");
- export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)");
+ transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do "
+ "transpose lhs matrix (0)");
+ export_to_cl_image_rhs->set_help(
+ "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)");
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
GemmConfigOptions(const GemmConfigOptions &) = delete;
@@ -133,17 +136,18 @@ public:
/** Default destructor */
~GemmConfigOptions() = default;
- SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
- SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
- SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
- SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */
- SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
+ SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
+ SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
+ SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
+ SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */
+ SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
SimpleOption<size_t> *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */
SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */
// FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and
// transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
// 2 variants (both transposed and none transposed)
- SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */
+ SimpleOption<size_t> *
+ transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */
SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/
};
@@ -198,13 +202,13 @@ public:
// Parse command line options
parser.parse(argc, argv);
- if(param_options.help->is_set() && param_options.help->value())
+ if (param_options.help->is_set() && param_options.help->value())
{
// Print help message
parser.print_help(argv[0]);
return false;
}
- if(!parser.validate())
+ if (!parser.validate())
{
// Invalid arguments. Use default parameters and configs
std::cerr << "Invalid arguments." << std::endl;
@@ -256,20 +260,22 @@ public:
kernel_info.broadcast_bias = true;
kernel_info.activation_info = act_info;
- if(rhs_info.h0 == 0)
+ if (rhs_info.h0 == 0)
{
rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U);
}
// Initialise lhs_reshaped tensor info
- lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type));
+ lhs_reshaped.allocator()->init(
+ TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type));
// Initialise rhs_reshaped tensor info
- rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
+ rhs_reshaped.allocator()->init(
+ TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
- if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
+ if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
{
std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl;
return false;
@@ -279,7 +285,7 @@ public:
// Validate arguments
Status status{};
status = reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, kernel_info.reinterpret_input_as_3d);
- if(!status)
+ if (!status)
{
// Unsupported arguments
std::cerr << "Unsupported arguments." << std::endl;
@@ -287,8 +293,9 @@ public:
return false;
}
- status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
- if(!status)
+ status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info,
+ rhs_info, kernel_info);
+ if (!status)
{
// Unsupported arguments
std::cerr << "Unsupported arguments." << std::endl;
@@ -300,7 +307,8 @@ public:
reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info);
// Configure function
- gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+ gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info,
+ rhs_info, kernel_info);
// Allocate tensors
lhs.allocator()->allocate();
@@ -315,14 +323,11 @@ public:
void do_run() override
{
// Execute the functions
- ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } });
+ ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}});
reshape_lhs.run(reshape_lsh_pack);
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped },
- { ACL_SRC_1, &rhs_reshaped },
- { ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ ITensorPack gemm_pack(
+ {{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}});
gemm.run(gemm_pack);
// Make sure all the OpenCL jobs are done executing:
diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp
index 0ad2a65dc2..cfea2c9bac 100644
--- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp
+++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp
@@ -25,20 +25,21 @@
#error "This example needs to be built with -DARM_COMPUTE_CL"
#endif /* ARM_COMPUTE_CL */
-#include "CommonGemmExampleOptions.h"
-#include "GemmTunerHelpers.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
#include "tests/CL/Helper.h"
-#include "utils/Utils.h"
#include "utils/command_line/CommandLineOptions.h"
#include "utils/command_line/CommandLineParser.h"
+#include "utils/Utils.h"
+#include "CommonGemmExampleOptions.h"
+#include "GemmTunerHelpers.h"
#include <cstdlib>
using namespace arm_compute;
@@ -52,13 +53,13 @@ namespace
/** Structure holding all tunable gemm configs specific to this example/strategy */
struct GemmConfigs
{
- size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */
- size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */
- size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */
- size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_rhs{ true }; /**< Interleave rhs matrix */
- bool transpose_rhs{ true }; /**< Transpose rhs matrix */
- bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image.*/
+ size_t m0{4}; /**< Number of rows processed by the matrix multiplication */
+ size_t n0{4}; /**< Number of columns processed by the matrix multiplication */
+ size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */
+ size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_rhs{true}; /**< Interleave rhs matrix */
+ bool transpose_rhs{true}; /**< Transpose rhs matrix */
+ bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image.*/
};
/** Formatted output of the GemmConfigs type
@@ -106,7 +107,8 @@ public:
h0->set_help("Number of horizontal blocks of size (k0xn0) stored on the same output row");
interleave_rhs->set_help("Interleave rhs matrix (1) / Do not interleave rhs matrix (0)");
transpose_rhs->set_help("Transpose rhs matrix (1) / Do not transpose rhs matrix (0)");
- export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)");
+ export_to_cl_image_rhs->set_help(
+ "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)");
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
GemmConfigOptions(const GemmConfigOptions &) = delete;
@@ -119,10 +121,10 @@ public:
/** Default destructor */
~GemmConfigOptions() = default;
- SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
- SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
- SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
- SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
+ SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
+ SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
+ SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
+ SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */
SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */
SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/
@@ -170,13 +172,13 @@ public:
// Parse command line options
parser.parse(argc, argv);
- if(param_options.help->is_set() && param_options.help->value())
+ if (param_options.help->is_set() && param_options.help->value())
{
// Print help message
parser.print_help(argv[0]);
return false;
}
- if(!parser.validate())
+ if (!parser.validate())
{
// Invalid arguments. Use default parameters and configs
std::cerr << "Invalid arguments." << std::endl;
@@ -225,17 +227,18 @@ public:
kernel_info.broadcast_bias = true;
kernel_info.activation_info = act_info;
- if(rhs_info.h0 == 0)
+ if (rhs_info.h0 == 0)
{
rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U);
}
// Initialise rhs_reshaped tensor info
- rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
+ rhs_reshaped.allocator()->init(
+ TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
- if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
+ if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
{
std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl;
return false;
@@ -244,8 +247,9 @@ public:
// Validate arguments
Status status{};
- status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
- if(!status)
+ status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info,
+ rhs_info, kernel_info);
+ if (!status)
{
// Unsupported arguments
std::cerr << "Unsupported arguments." << std::endl;
@@ -254,7 +258,8 @@ public:
}
// Configure function
- gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+ gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info,
+ kernel_info);
// Allocate tensors
lhs.allocator()->allocate();
@@ -268,11 +273,7 @@ public:
void do_run() override
{
// Execute the function
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs },
- { ACL_SRC_1, &rhs_reshaped },
- { ACL_SRC_2, &bias },
- { ACL_DST, &dst }
- });
+ ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}});
gemm.run(gemm_pack);
// Make sure all the OpenCL jobs are done executing:
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
index 9cf9c9fed0..3808b98b7d 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp
@@ -31,14 +31,15 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "examples/gemm_tuner/CommonGemmExampleOptions.h"
#include "examples/gemm_tuner/GemmTunerHelpers.h"
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
#include "tests/CL/Helper.h"
-#include "utils/Utils.h"
#include "utils/command_line/CommandLineOptions.h"
#include "utils/command_line/CommandLineParser.h"
+#include "utils/Utils.h"
#include <cstdlib>
@@ -53,15 +54,15 @@ namespace
/** Structure holding all tunable gemm configs specific to this example/strategy */
struct GemmConfigs
{
- size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */
- size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */
- size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */
- size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_lhs{ true }; /**< Interleave lhs matrix */
- bool transpose_lhs{ true }; /**< Transpose lhs matrix. */
- bool interleave_rhs{ true }; /**< Interleave rhs matrix */
- bool transpose_rhs{ true }; /**< Transpose rhs matrix. */
+ size_t m0{4}; /**< Number of rows processed by the matrix multiplication */
+ size_t n0{4}; /**< Number of columns processed by the matrix multiplication */
+ size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */
+ size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_lhs{true}; /**< Interleave lhs matrix */
+ bool transpose_lhs{true}; /**< Transpose lhs matrix. */
+ bool interleave_rhs{true}; /**< Interleave rhs matrix */
+ bool transpose_rhs{true}; /**< Transpose rhs matrix. */
};
/** Formatted output of the GemmConfigs type
@@ -116,7 +117,8 @@ public:
// FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and
// transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
// 2 variants (both transposed and none transposed)
- transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)");
+ transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do "
+ "transpose lhs matrix (0)");
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
GemmConfigOptions(const GemmConfigOptions &) = delete;
@@ -129,17 +131,18 @@ public:
/** Default destructor */
~GemmConfigOptions() = default;
- SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
- SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
- SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
- SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */
- SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
+ SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
+ SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
+ SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
+ SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */
+ SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
SimpleOption<size_t> *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */
SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */
// FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and
// transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
// 2 variants (both transposed and none transposed)
- SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */
+ SimpleOption<size_t> *
+ transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */
};
/** Consumes the gemm configuration options and creates a structure containing all information
@@ -186,12 +189,12 @@ public:
GemmConfigOptions config_options(parser);
parser.parse(argc, argv);
- if(param_options.help->is_set() && param_options.help->value())
+ if (param_options.help->is_set() && param_options.help->value())
{
parser.print_help(argv[0]);
return false;
}
- if(!parser.validate())
+ if (!parser.validate())
{
// Invalid arguments. Use default parameters and configs
std::cerr << "Invalid arguments." << std::endl;
@@ -217,10 +220,7 @@ public:
rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type));
// Set arbitrary quantization information
- const QuantizationInfo q_info
- {
- 0.012, 3
- };
+ const QuantizationInfo q_info{0.012, 3};
lhs.info()->set_quantization_info(q_info);
rhs.info()->set_quantization_info(q_info);
dst.info()->set_quantization_info(q_info);
@@ -240,45 +240,44 @@ public:
rhs_info.transpose = configs.transpose_rhs;
rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet
- if(rhs_info.h0 == 0)
+ if (rhs_info.h0 == 0)
{
rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U);
}
- lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type));
- rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
+ lhs_reshaped.allocator()->init(
+ TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type));
+ rhs_reshaped.allocator()->init(
+ TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
lhs_reshaped.info()->set_quantization_info(q_info);
rhs_reshaped.info()->set_quantization_info(q_info);
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
- if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
+ if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
{
std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl;
return false;
}
}
- GEMMReshapeInfo gemm_info
- {
- static_cast<int>(params.M),
- static_cast<int>(params.N),
- static_cast<int>(params.K),
- static_cast<int>(configs.h0),
- static_cast<int>(configs.v0),
- 0,
- false,
- true
- };
+ GEMMReshapeInfo gemm_info{static_cast<int>(params.M),
+ static_cast<int>(params.N),
+ static_cast<int>(params.K),
+ static_cast<int>(configs.h0),
+ static_cast<int>(configs.v0),
+ 0,
+ false,
+ true};
// Validate arguments
- if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d()))
+ if (!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d()))
{
std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl;
return false;
}
- if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info))
+ if (!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info))
{
std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl;
return false;
@@ -300,10 +299,10 @@ public:
}
void do_run() override
{
- ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } });
+ ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}});
reshape_lhs.run(reshape_lsh_pack);
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+ ITensorPack gemm_pack({{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_DST, &dst}});
gemm.run(gemm_pack);
// Make sure all the OpenCL jobs are done executing:
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
index 94f3c93166..4acb316a3c 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
@@ -25,23 +25,23 @@
#error "This example needs to be built with -DARM_COMPUTE_CL"
#endif /* ARM_COMPUTE_CL */
-#include "CommonGemmExampleOptions.h"
-#include "GemmTunerHelpers.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
#include "tests/CL/Helper.h"
-#include "utils/Utils.h"
#include "utils/command_line/CommandLineOptions.h"
#include "utils/command_line/CommandLineParser.h"
+#include "utils/Utils.h"
+#include "CommonGemmExampleOptions.h"
+#include "GemmTunerHelpers.h"
#include <cstdlib>
#include <memory>
@@ -56,12 +56,12 @@ namespace
/** Structure holding all tunable gemm configs specific to this example/strategy */
struct GemmConfigs
{
- size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */
- size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */
- size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */
- size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_rhs{ true }; /**< Interleave rhs matrix */
- bool transpose_rhs{ true }; /**< Transpose rhs matrix */
+ size_t m0{4}; /**< Number of rows processed by the matrix multiplication */
+ size_t n0{4}; /**< Number of columns processed by the matrix multiplication */
+ size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */
+ size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_rhs{true}; /**< Interleave rhs matrix */
+ bool transpose_rhs{true}; /**< Transpose rhs matrix */
};
/** Formatted output of the GemmConfigs type
@@ -119,10 +119,10 @@ public:
/** Default destructor */
~GemmConfigOptions() = default;
- SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
- SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
- SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
- SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
+ SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */
+ SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */
+ SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */
+ SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */
SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */
};
@@ -147,8 +147,9 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options)
} // namespace
-using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>;
-using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>;
+using ClGemmLowpMatrixMultiplyReshapedOnlyRhs =
+ test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>;
+using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>;
class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example
{
@@ -165,12 +166,12 @@ public:
GemmConfigOptions config_options(parser);
parser.parse(argc, argv);
- if(param_options.help->is_set() && param_options.help->value())
+ if (param_options.help->is_set() && param_options.help->value())
{
parser.print_help(argv[0]);
return false;
}
- if(!parser.validate())
+ if (!parser.validate())
{
// Invalid arguments. Use default parameters and configs
std::cerr << "Invalid arguments." << std::endl;
@@ -199,10 +200,7 @@ public:
// Set arbitrary quantization information (non-zero offset to ensure offset contribution stage is included)
// Could be extended in the future to include a user-controlled option for offset == 0
- const QuantizationInfo q_info
- {
- 0.012, 3
- };
+ const QuantizationInfo q_info{0.012, 3};
lhs.info()->set_quantization_info(q_info);
rhs.info()->set_quantization_info(q_info);
bias.info()->set_quantization_info(q_info);
@@ -220,16 +218,17 @@ public:
rhs_info.transpose = configs.transpose_rhs;
rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet
- if(rhs_info.h0 == 0)
+ if (rhs_info.h0 == 0)
{
rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U);
}
- rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
+ rhs_reshaped.allocator()->init(
+ TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type));
rhs_reshaped.info()->set_quantization_info(q_info);
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
- if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
+ if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info()))
{
std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl;
return false;
@@ -251,9 +250,7 @@ public:
gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(lhs.info(),
- rhs.info(),
- dst.info(),
+ quantization::compute_quantized_multipliers_and_shifts(lhs.info(), rhs.info(), dst.info(),
gemmlowp_output_stage.gemmlowp_multipliers.data(),
gemmlowp_output_stage.gemmlowp_shifts.data());
gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
@@ -290,14 +287,14 @@ public:
gemm_info.output_stage = gemmlowp_output_stage;
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(gemm_info.b_offset != 0)
+ if (gemm_info.b_offset != 0)
{
const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32);
vector_sum_row.allocator()->init(info_vector_sum_row);
mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>();
- if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}))
+ if (!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}))
{
std::cerr << "Invalid arguments for CLGEMMLowpMatrixAReductionKernel." << std::endl;
return false;
@@ -306,7 +303,7 @@ public:
mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{});
}
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
const TensorInfo info_vector_sum_col(compute_reductionA_shape(*rhs.info()), 1, DataType::S32);
vector_sum_col.allocator()->init(info_vector_sum_col);
@@ -314,8 +311,10 @@ public:
}
// Validate arguments
- if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(),
- gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info()))
+ if (!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info,
+ gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(),
+ gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(),
+ dst_multipliers.info(), dst_shifts.info()))
{
std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl;
return false;
@@ -323,8 +322,9 @@ public:
// Configure function
gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info,
- gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(),
- bias.info(), dst_multipliers.info(), dst_shifts.info());
+ gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(),
+ gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(),
+ dst_shifts.info());
// Allocate tensors
lhs.allocator()->allocate();
@@ -341,13 +341,20 @@ public:
}
void do_run() override
{
- if(mtx_a_reduction != nullptr)
+ if (mtx_a_reduction != nullptr)
{
- ITensorPack red_pack({ { ACL_SRC, &lhs }, { ACL_DST, &dst } });
+ ITensorPack red_pack({{ACL_SRC, &lhs}, {ACL_DST, &dst}});
mtx_a_reduction->run(red_pack);
}
- ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs }, { ACL_BIAS, &bias }, { ACL_VEC_COL_SUM, &vector_sum_col }, { ACL_VEC_ROW_SUM, &vector_sum_row }, { ACL_SHIFTS, &dst_shifts }, { ACL_MULTIPLIERS, &dst_multipliers }, { ACL_DST, &dst } });
+ ITensorPack gemm_pack({{ACL_SRC_0, &lhs},
+ {ACL_SRC_1, &rhs},
+ {ACL_BIAS, &bias},
+ {ACL_VEC_COL_SUM, &vector_sum_col},
+ {ACL_VEC_ROW_SUM, &vector_sum_row},
+ {ACL_SHIFTS, &dst_shifts},
+ {ACL_MULTIPLIERS, &dst_multipliers},
+ {ACL_DST, &dst}});
gemm.run(gemm_pack);
// Make sure all the OpenCL jobs are done executing:
@@ -370,7 +377,7 @@ private:
CLTensor dst_shifts{};
CLTuner tuner{};
ClGemmLowpMatrixMultiplyReshapedOnlyRhs gemm{};
- std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{ nullptr };
+ std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{nullptr};
};
/** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint
diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp
index 53a4547e04..be0b8a7d8a 100644
--- a/examples/graph_alexnet.cpp
+++ b/examples/graph_alexnet.cpp
@@ -39,8 +39,7 @@ using namespace arm_compute::graph_utils;
class GraphAlexnetExample : public Example
{
public:
- GraphAlexnetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "AlexNet")
+ GraphAlexnetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "AlexNet")
{
}
bool do_setup(int argc, char **argv) override
@@ -53,14 +52,15 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -69,88 +69,80 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 122.68f, 116.67f, 104.01f } };
+ const std::array<float, 3> mean_rgb{{122.68f, 116.67f, 104.01f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
- // Layer 1
- << ConvolutionLayer(
- 11U, 11U, 96U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_b.npy"),
- PadStrideInfo(4, 4, 0, 0))
- .set_name("conv1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu1")
- << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm1")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1")
- // Layer 2
- << ConvolutionLayer(
- 5U, 5U, 256U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_b.npy"),
- PadStrideInfo(1, 1, 2, 2), 2)
- .set_name("conv2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu2")
- << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm2")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2")
- // Layer 3
- << ConvolutionLayer(
- 3U, 3U, 384U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu3")
- // Layer 4
- << ConvolutionLayer(
- 3U, 3U, 384U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_b.npy"),
- PadStrideInfo(1, 1, 1, 1), 2)
- .set_name("conv4")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu4")
- // Layer 5
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_b.npy"),
- PadStrideInfo(1, 1, 1, 1), 2)
- .set_name("conv5")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu5")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool5")
- // Layer 6
- << FullyConnectedLayer(
- 4096U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_b.npy"))
- .set_name("fc6")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu6")
- // Layer 7
- << FullyConnectedLayer(
- 4096U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_b.npy"))
- .set_name("fc7")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu7")
- // Layer 8
- << FullyConnectedLayer(
- 1000U,
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_b.npy"))
- .set_name("fc8")
- // Softmax
- << SoftmaxLayer().set_name("prob")
- << OutputLayer(get_output_accessor(common_params, 5));
+ graph
+ << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
+ // Layer 1
+ << ConvolutionLayer(11U, 11U, 96U,
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv1_b.npy"),
+ PadStrideInfo(4, 4, 0, 0))
+ .set_name("conv1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu1")
+ << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm1")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool1")
+ // Layer 2
+ << ConvolutionLayer(
+ 5U, 5U, 256U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_b.npy"), PadStrideInfo(1, 1, 2, 2), 2)
+ .set_name("conv2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu2")
+ << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("norm2")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool2")
+ // Layer 3
+ << ConvolutionLayer(
+ 3U, 3U, 384U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu3")
+ // Layer 4
+ << ConvolutionLayer(
+ 3U, 3U, 384U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv4_b.npy"), PadStrideInfo(1, 1, 1, 1), 2)
+ .set_name("conv4")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu4")
+ // Layer 5
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv5_b.npy"), PadStrideInfo(1, 1, 1, 1), 2)
+ .set_name("conv5")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu5")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool5")
+ // Layer 6
+ << FullyConnectedLayer(4096U,
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc6_b.npy"))
+ .set_name("fc6")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu6")
+ // Layer 7
+ << FullyConnectedLayer(4096U,
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc7_b.npy"))
+ .set_name("fc7")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu7")
+ // Layer 8
+ << FullyConnectedLayer(1000U,
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/alexnet_model/fc8_b.npy"))
+ .set_name("fc8")
+ // Softmax
+ << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -163,7 +155,7 @@ public:
// Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed
// compilation won't be required.
- if(common_params.enable_cl_cache)
+ if (common_params.enable_cl_cache)
{
#ifdef ARM_COMPUTE_CL
restore_program_cache_from_file();
@@ -173,7 +165,7 @@ public:
graph.finalize(common_params.target, config);
// Save the opencl kernels to a file
- if(common_opts.enable_cl_cache)
+ if (common_opts.enable_cl_cache)
{
#ifdef ARM_COMPUTE_CL
save_program_cache_to_file();
diff --git a/examples/graph_deepspeech_v0_4_1.cpp b/examples/graph_deepspeech_v0_4_1.cpp
index da163b6493..08cd4a47b1 100644
--- a/examples/graph_deepspeech_v0_4_1.cpp
+++ b/examples/graph_deepspeech_v0_4_1.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/graph.h"
#include "arm_compute/graph/Types.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -37,8 +38,7 @@ using namespace arm_compute::graph_utils;
class GraphDeepSpeechExample : public Example
{
public:
- GraphDeepSpeechExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "DeepSpeech v0.4.1")
+ GraphDeepSpeechExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "DeepSpeech v0.4.1")
{
}
bool do_setup(int argc, char **argv) override
@@ -51,7 +51,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -64,7 +64,7 @@ public:
std::string data_path = common_params.data_path;
const std::string model_path = "/cnn_data/deepspeech_model/";
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
@@ -77,131 +77,131 @@ public:
const float cell_clip = 20.f;
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(26U, 19U, n_steps, 1U), DataLayout::NHWC, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(26U, 19U, n_steps, 1U), DataLayout::NHWC, common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NHWC;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor,
- get_weights_accessor(data_path, "input_values_x" + std::to_string(n_steps) + ".npy", weights_layout))
- .set_name("input_node");
+ get_weights_accessor(data_path, "input_values_x" + std::to_string(n_steps) + ".npy",
+ weights_layout))
+ .set_name("input_node");
- if(common_params.data_layout == DataLayout::NCHW)
+ if (common_params.data_layout == DataLayout::NCHW)
{
graph << PermuteLayer(PermutationVector(2U, 0U, 1U), common_params.data_layout).set_name("permute_to_nhwc");
}
graph << ReshapeLayer(TensorShape(494U, n_steps)).set_name("Reshape_input")
// Layer 1
- << FullyConnectedLayer(
- 2048U,
- get_weights_accessor(data_path, "h1_transpose.npy", weights_layout),
- get_weights_accessor(data_path, "MatMul_bias.npy"))
- .set_name("fc0")
+ << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h1_transpose.npy", weights_layout),
+ get_weights_accessor(data_path, "MatMul_bias.npy"))
+ .set_name("fc0")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip))
- .set_name("Relu")
+ .set_name("Relu")
// Layer 2
- << FullyConnectedLayer(
- 2048U,
- get_weights_accessor(data_path, "h2_transpose.npy", weights_layout),
- get_weights_accessor(data_path, "MatMul_1_bias.npy"))
- .set_name("fc1")
+ << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h2_transpose.npy", weights_layout),
+ get_weights_accessor(data_path, "MatMul_1_bias.npy"))
+ .set_name("fc1")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip))
- .set_name("Relu_1")
+ .set_name("Relu_1")
// Layer 3
- << FullyConnectedLayer(
- 2048U,
- get_weights_accessor(data_path, "h3_transpose.npy", weights_layout),
- get_weights_accessor(data_path, "MatMul_2_bias.npy"))
- .set_name("fc2")
+ << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h3_transpose.npy", weights_layout),
+ get_weights_accessor(data_path, "MatMul_2_bias.npy"))
+ .set_name("fc2")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip))
- .set_name("Relu_2")
+ .set_name("Relu_2")
// Layer 4
<< ReshapeLayer(TensorShape(2048U, 1U, n_steps)).set_name("Reshape_1");
// Unstack Layer (using SplitLayerNode)
- NodeParams unstack_params = { "unstack", graph.hints().target_hint };
- NodeID unstack_nid = GraphBuilder::add_split_node(graph.graph(), unstack_params, { graph.tail_node(), 0 }, n_steps, 2);
+ NodeParams unstack_params = {"unstack", graph.hints().target_hint};
+ NodeID unstack_nid =
+ GraphBuilder::add_split_node(graph.graph(), unstack_params, {graph.tail_node(), 0}, n_steps, 2);
// Create input state descriptor
- TensorDescriptor state_descriptor = TensorDescriptor(TensorShape(2048U), common_params.data_type).set_layout(common_params.data_layout);
- SubStream previous_state(graph);
- SubStream add_y(graph);
+ TensorDescriptor state_descriptor =
+ TensorDescriptor(TensorShape(2048U), common_params.data_type).set_layout(common_params.data_layout);
+ SubStream previous_state(graph);
+ SubStream add_y(graph);
// Initial state for LSTM is all zeroes for both state_h and state_c, therefore only one input is created
- previous_state << InputLayer(state_descriptor,
- get_weights_accessor(data_path, "zeros.npy"))
- .set_name("previous_state_c_h");
- add_y << InputLayer(state_descriptor,
- get_weights_accessor(data_path, "ones.npy"))
- .set_name("add_y");
+ previous_state << InputLayer(state_descriptor, get_weights_accessor(data_path, "zeros.npy"))
+ .set_name("previous_state_c_h");
+ add_y << InputLayer(state_descriptor, get_weights_accessor(data_path, "ones.npy")).set_name("add_y");
// Create LSTM Fully Connected weights and bias descriptors
- TensorDescriptor lstm_weights_descriptor = TensorDescriptor(TensorShape(4096U, 8192U), common_params.data_type).set_layout(common_params.data_layout);
- TensorDescriptor lstm_bias_descriptor = TensorDescriptor(TensorShape(8192U), common_params.data_type).set_layout(common_params.data_layout);
- SubStream lstm_fc_weights(graph);
- SubStream lstm_fc_bias(graph);
- lstm_fc_weights << ConstantLayer(lstm_weights_descriptor,
- get_weights_accessor(data_path, "rnn_lstm_cell_kernel_transpose.npy", weights_layout))
- .set_name("h5/transpose");
+ TensorDescriptor lstm_weights_descriptor =
+ TensorDescriptor(TensorShape(4096U, 8192U), common_params.data_type).set_layout(common_params.data_layout);
+ TensorDescriptor lstm_bias_descriptor =
+ TensorDescriptor(TensorShape(8192U), common_params.data_type).set_layout(common_params.data_layout);
+ SubStream lstm_fc_weights(graph);
+ SubStream lstm_fc_bias(graph);
+ lstm_fc_weights << ConstantLayer(
+ lstm_weights_descriptor,
+ get_weights_accessor(data_path, "rnn_lstm_cell_kernel_transpose.npy", weights_layout))
+ .set_name("h5/transpose");
lstm_fc_bias << ConstantLayer(lstm_bias_descriptor,
get_weights_accessor(data_path, "rnn_lstm_cell_MatMul_bias.npy"))
- .set_name("MatMul_3_bias");
+ .set_name("MatMul_3_bias");
// LSTM Block
- std::pair<SubStream, SubStream> new_state_1 = add_lstm_cell(unstack_nid, 0, previous_state, previous_state, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_2 = add_lstm_cell(unstack_nid, 1, new_state_1.first, new_state_1.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_3 = add_lstm_cell(unstack_nid, 2, new_state_2.first, new_state_2.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_4 = add_lstm_cell(unstack_nid, 3, new_state_3.first, new_state_3.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_5 = add_lstm_cell(unstack_nid, 4, new_state_4.first, new_state_4.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_6 = add_lstm_cell(unstack_nid, 5, new_state_5.first, new_state_5.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_7 = add_lstm_cell(unstack_nid, 6, new_state_6.first, new_state_6.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_8 = add_lstm_cell(unstack_nid, 7, new_state_7.first, new_state_7.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_9 = add_lstm_cell(unstack_nid, 8, new_state_8.first, new_state_8.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_10 = add_lstm_cell(unstack_nid, 9, new_state_9.first, new_state_9.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_11 = add_lstm_cell(unstack_nid, 10, new_state_10.first, new_state_10.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_12 = add_lstm_cell(unstack_nid, 11, new_state_11.first, new_state_11.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_13 = add_lstm_cell(unstack_nid, 12, new_state_12.first, new_state_12.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_14 = add_lstm_cell(unstack_nid, 13, new_state_13.first, new_state_13.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_15 = add_lstm_cell(unstack_nid, 14, new_state_14.first, new_state_14.second, add_y, lstm_fc_weights, lstm_fc_bias);
- std::pair<SubStream, SubStream> new_state_16 = add_lstm_cell(unstack_nid, 15, new_state_15.first, new_state_15.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_1 =
+ add_lstm_cell(unstack_nid, 0, previous_state, previous_state, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_2 =
+ add_lstm_cell(unstack_nid, 1, new_state_1.first, new_state_1.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_3 =
+ add_lstm_cell(unstack_nid, 2, new_state_2.first, new_state_2.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_4 =
+ add_lstm_cell(unstack_nid, 3, new_state_3.first, new_state_3.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_5 =
+ add_lstm_cell(unstack_nid, 4, new_state_4.first, new_state_4.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_6 =
+ add_lstm_cell(unstack_nid, 5, new_state_5.first, new_state_5.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_7 =
+ add_lstm_cell(unstack_nid, 6, new_state_6.first, new_state_6.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_8 =
+ add_lstm_cell(unstack_nid, 7, new_state_7.first, new_state_7.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_9 =
+ add_lstm_cell(unstack_nid, 8, new_state_8.first, new_state_8.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_10 =
+ add_lstm_cell(unstack_nid, 9, new_state_9.first, new_state_9.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_11 = add_lstm_cell(
+ unstack_nid, 10, new_state_10.first, new_state_10.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_12 = add_lstm_cell(
+ unstack_nid, 11, new_state_11.first, new_state_11.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_13 = add_lstm_cell(
+ unstack_nid, 12, new_state_12.first, new_state_12.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_14 = add_lstm_cell(
+ unstack_nid, 13, new_state_13.first, new_state_13.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_15 = add_lstm_cell(
+ unstack_nid, 14, new_state_14.first, new_state_14.second, add_y, lstm_fc_weights, lstm_fc_bias);
+ std::pair<SubStream, SubStream> new_state_16 = add_lstm_cell(
+ unstack_nid, 15, new_state_15.first, new_state_15.second, add_y, lstm_fc_weights, lstm_fc_bias);
// Concatenate new states on height
const int axis = 1;
- graph << StackLayer(axis,
- std::move(new_state_1.second),
- std::move(new_state_2.second),
- std::move(new_state_3.second),
- std::move(new_state_4.second),
- std::move(new_state_5.second),
- std::move(new_state_6.second),
- std::move(new_state_7.second),
- std::move(new_state_8.second),
- std::move(new_state_9.second),
- std::move(new_state_10.second),
- std::move(new_state_11.second),
- std::move(new_state_12.second),
- std::move(new_state_13.second),
- std::move(new_state_14.second),
- std::move(new_state_15.second),
- std::move(new_state_16.second))
- .set_name("concat");
-
- graph << FullyConnectedLayer(
- 2048U,
- get_weights_accessor(data_path, "h5_transpose.npy", weights_layout),
- get_weights_accessor(data_path, "MatMul_3_bias.npy"))
- .set_name("fc3")
+ graph << StackLayer(axis, std::move(new_state_1.second), std::move(new_state_2.second),
+ std::move(new_state_3.second), std::move(new_state_4.second), std::move(new_state_5.second),
+ std::move(new_state_6.second), std::move(new_state_7.second), std::move(new_state_8.second),
+ std::move(new_state_9.second), std::move(new_state_10.second),
+ std::move(new_state_11.second), std::move(new_state_12.second),
+ std::move(new_state_13.second), std::move(new_state_14.second),
+ std::move(new_state_15.second), std::move(new_state_16.second))
+ .set_name("concat");
+
+ graph << FullyConnectedLayer(2048U, get_weights_accessor(data_path, "h5_transpose.npy", weights_layout),
+ get_weights_accessor(data_path, "MatMul_3_bias.npy"))
+ .set_name("fc3")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, cell_clip))
- .set_name("Relu3")
- << FullyConnectedLayer(
- 29U,
- get_weights_accessor(data_path, "h6_transpose.npy", weights_layout),
- get_weights_accessor(data_path, "MatMul_4_bias.npy"))
- .set_name("fc3")
+ .set_name("Relu3")
+ << FullyConnectedLayer(29U, get_weights_accessor(data_path, "h6_transpose.npy", weights_layout),
+ get_weights_accessor(data_path, "MatMul_4_bias.npy"))
+ .set_name("fc3")
<< SoftmaxLayer().set_name("logits");
graph << OutputLayer(get_output_accessor(common_params, 5));
@@ -241,7 +241,7 @@ private:
return Status{};
}
- std::pair<SubStream, SubStream> add_lstm_cell(NodeID unstack_nid,
+ std::pair<SubStream, SubStream> add_lstm_cell(NodeID unstack_nid,
unsigned int unstack_idx,
SubStream previous_state_c,
SubStream previous_state_h,
@@ -250,41 +250,41 @@ private:
SubStream lstm_fc_bias)
{
const std::string cell_name("rnn/lstm_cell_" + std::to_string(unstack_idx));
- const DataLayoutDimension concat_dim = (common_params.data_layout == DataLayout::NHWC) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::WIDTH;
+ const DataLayoutDimension concat_dim =
+ (common_params.data_layout == DataLayout::NHWC) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::WIDTH;
// Concatenate result of Unstack with previous_state_h
- NodeParams concat_params = { cell_name + "/concat", graph.hints().target_hint };
+ NodeParams concat_params = {cell_name + "/concat", graph.hints().target_hint};
NodeID concat_nid = graph.graph().add_node<ConcatenateLayerNode>(2, concat_dim);
graph.graph().add_connection(unstack_nid, unstack_idx, concat_nid, 0);
graph.graph().add_connection(previous_state_h.tail_node(), 0, concat_nid, 1);
set_node_params(graph.graph(), concat_nid, concat_params);
graph.forward_tail(concat_nid);
- graph << FullyConnectedLayer(
- 8192U,
- lstm_fc_weights,
- lstm_fc_bias)
- .set_name(cell_name + "/BiasAdd");
+ graph << FullyConnectedLayer(8192U, lstm_fc_weights, lstm_fc_bias).set_name(cell_name + "/BiasAdd");
// Split Layer
const unsigned int num_splits = 4;
const unsigned int split_axis = 0;
- NodeParams split_params = { cell_name + "/split", graph.hints().target_hint };
- NodeID split_nid = GraphBuilder::add_split_node(graph.graph(), split_params, { graph.tail_node(), 0 }, num_splits, split_axis);
+ NodeParams split_params = {cell_name + "/split", graph.hints().target_hint};
+ NodeID split_nid =
+ GraphBuilder::add_split_node(graph.graph(), split_params, {graph.tail_node(), 0}, num_splits, split_axis);
- NodeParams sigmoid_1_params = { cell_name + "/Sigmoid_1", graph.hints().target_hint };
- NodeParams add_params = { cell_name + "/add", graph.hints().target_hint };
- NodeParams sigmoid_2_params = { cell_name + "/Sigmoid_2", graph.hints().target_hint };
- NodeParams tanh_params = { cell_name + "/Tanh", graph.hints().target_hint };
+ NodeParams sigmoid_1_params = {cell_name + "/Sigmoid_1", graph.hints().target_hint};
+ NodeParams add_params = {cell_name + "/add", graph.hints().target_hint};
+ NodeParams sigmoid_2_params = {cell_name + "/Sigmoid_2", graph.hints().target_hint};
+ NodeParams tanh_params = {cell_name + "/Tanh", graph.hints().target_hint};
// Sigmoid 1 (first split)
- NodeID sigmoid_1_nid = graph.graph().add_node<ActivationLayerNode>(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ NodeID sigmoid_1_nid = graph.graph().add_node<ActivationLayerNode>(
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
graph.graph().add_connection(split_nid, 0, sigmoid_1_nid, 0);
set_node_params(graph.graph(), sigmoid_1_nid, sigmoid_1_params);
// Tanh (second split)
- NodeID tanh_nid = graph.graph().add_node<ActivationLayerNode>(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ NodeID tanh_nid = graph.graph().add_node<ActivationLayerNode>(
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
graph.graph().add_connection(split_nid, 1, tanh_nid, 0);
set_node_params(graph.graph(), tanh_nid, tanh_params);
@@ -292,13 +292,15 @@ private:
tanh_ss.forward_tail(tanh_nid);
// Add (third split)
- NodeID add_nid = graph.graph().add_node<EltwiseLayerNode>(descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add });
+ NodeID add_nid =
+ graph.graph().add_node<EltwiseLayerNode>(descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add});
graph.graph().add_connection(split_nid, 2, add_nid, 0);
graph.graph().add_connection(add_y.tail_node(), 0, add_nid, 1);
set_node_params(graph.graph(), add_nid, add_params);
// Sigmoid 2 (fourth split)
- NodeID sigmoid_2_nid = graph.graph().add_node<ActivationLayerNode>(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ NodeID sigmoid_2_nid = graph.graph().add_node<ActivationLayerNode>(
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
graph.graph().add_connection(split_nid, 3, sigmoid_2_nid, 0);
set_node_params(graph.graph(), sigmoid_2_nid, sigmoid_2_params);
@@ -306,28 +308,28 @@ private:
sigmoid_1_ss.forward_tail(sigmoid_1_nid);
SubStream mul_1_ss(sigmoid_1_ss);
mul_1_ss << EltwiseLayer(std::move(sigmoid_1_ss), std::move(tanh_ss), EltwiseOperation::Mul)
- .set_name(cell_name + "/mul_1");
+ .set_name(cell_name + "/mul_1");
SubStream tanh_1_ss_tmp(graph);
tanh_1_ss_tmp.forward_tail(add_nid);
tanh_1_ss_tmp << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))
- .set_name(cell_name + "/Sigmoid");
+ .set_name(cell_name + "/Sigmoid");
SubStream tanh_1_ss_tmp2(tanh_1_ss_tmp);
tanh_1_ss_tmp2 << EltwiseLayer(std::move(tanh_1_ss_tmp), std::move(previous_state_c), EltwiseOperation::Mul)
- .set_name(cell_name + "/mul");
+ .set_name(cell_name + "/mul");
SubStream tanh_1_ss(tanh_1_ss_tmp2);
tanh_1_ss << EltwiseLayer(std::move(tanh_1_ss_tmp2), std::move(mul_1_ss), EltwiseOperation::Add)
- .set_name(cell_name + "/new_state_c");
+ .set_name(cell_name + "/new_state_c");
SubStream new_state_c(tanh_1_ss);
tanh_1_ss << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))
- .set_name(cell_name + "/Tanh_1");
+ .set_name(cell_name + "/Tanh_1");
SubStream sigmoid_2_ss(graph);
sigmoid_2_ss.forward_tail(sigmoid_2_nid);
graph << EltwiseLayer(std::move(sigmoid_2_ss), std::move(tanh_1_ss), EltwiseOperation::Mul)
- .set_name(cell_name + "/new_state_h");
+ .set_name(cell_name + "/new_state_h");
SubStream new_state_h(graph);
return std::pair<SubStream, SubStream>(new_state_c, new_state_h);
diff --git a/examples/graph_edsr.cpp b/examples/graph_edsr.cpp
index 0e41f12155..b4f2fadf4a 100644
--- a/examples/graph_edsr.cpp
+++ b/examples/graph_edsr.cpp
@@ -22,28 +22,28 @@
* SOFTWARE.
*/
+#include "graph_edsr.h"
+
#include "arm_compute/graph/Utils.h"
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/Utils.h"
-#include "graph_edsr.h"
-
using namespace arm_compute::graph;
using namespace arm_compute::utils;
class GraphEdsrExample : public Example
{
public:
- GraphEdsrExample()
- : cmd_parser(), common_opts(cmd_parser), common_params()
+ GraphEdsrExample() : cmd_parser(), common_opts(cmd_parser), common_params()
{
expected_output_filename = cmd_parser.add_option<SimpleOption<std::string>>("expected-output-filename", "");
- expected_output_filename->set_help("Name of npy file containing the expected output to validate the graph output.");
+ expected_output_filename->set_help(
+ "Name of npy file containing the expected output to validate the graph output.");
}
- GraphEdsrExample(const GraphEdsrExample &) = delete;
+ GraphEdsrExample(const GraphEdsrExample &) = delete;
GraphEdsrExample &operator=(const GraphEdsrExample &) = delete;
~GraphEdsrExample() override = default;
@@ -57,13 +57,14 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
- ARM_COMPUTE_EXIT_ON_MSG(common_params.data_type != DataType::QASYMM8, "Only QASYMM8 is supported for this graph example");
+ ARM_COMPUTE_EXIT_ON_MSG(common_params.data_type != DataType::QASYMM8,
+ "Only QASYMM8 is supported for this graph example");
// Print parameter values
std::cout << common_params << std::endl;
@@ -98,7 +99,7 @@ private:
GraphContext context{};
GraphManager manager{};
- SimpleOption<std::string> *expected_output_filename{ nullptr };
+ SimpleOption<std::string> *expected_output_filename{nullptr};
GraphEdsr model{};
};
diff --git a/examples/graph_edsr.h b/examples/graph_edsr.h
index 72012afdcb..1161e4ba38 100644
--- a/examples/graph_edsr.h
+++ b/examples/graph_edsr.h
@@ -32,12 +32,12 @@
class GraphEdsr
{
public:
- GraphEdsr()
- : _graph(0, "EDSR")
+ GraphEdsr() : _graph(0, "EDSR")
{
}
- bool setup(const arm_compute::utils::CommonGraphParams &common_params, const arm_compute::utils::SimpleOption<std::string> &expected_output_filename)
+ bool setup(const arm_compute::utils::CommonGraphParams &common_params,
+ const arm_compute::utils::SimpleOption<std::string> &expected_output_filename)
{
using namespace arm_compute;
using namespace arm_compute::graph;
@@ -47,1221 +47,879 @@ public:
const auto &data_path = common_params.data_path;
const auto &target = common_params.target;
- NodeID id_upscale_net_FakeQuantWithMinMaxVars_transposed = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 12, 2, 2, 3 },
- DataType::QASYMM8,
- QuantizationInfo(0.00393533194437623, 1),
- DataLayout::NHWC });
- INode *node_upscale_net_FakeQuantWithMinMaxVars_transposed = _graph.node(id_upscale_net_FakeQuantWithMinMaxVars_transposed);
- node_upscale_net_FakeQuantWithMinMaxVars_transposed->set_common_node_parameters(NodeParams{ "upscale_net_FakeQuantWithMinMaxVars_transposed", target });
- node_upscale_net_FakeQuantWithMinMaxVars_transposed->output(0)->set_accessor(get_weights_accessor(data_path,
- "/cnn_data/edsr_model/upscale_net_FakeQuantWithMinMaxVars_transposed.npy", DataLayout::NHWC));
-
- NodeID id_pre_upscale_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 12 },
- DataType::S32,
- QuantizationInfo(2.9644968435604824e-06),
- DataLayout::NHWC });
+ NodeID id_upscale_net_FakeQuantWithMinMaxVars_transposed = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{12, 2, 2, 3}, DataType::QASYMM8, QuantizationInfo(0.00393533194437623, 1), DataLayout::NHWC});
+ INode *node_upscale_net_FakeQuantWithMinMaxVars_transposed =
+ _graph.node(id_upscale_net_FakeQuantWithMinMaxVars_transposed);
+ node_upscale_net_FakeQuantWithMinMaxVars_transposed->set_common_node_parameters(
+ NodeParams{"upscale_net_FakeQuantWithMinMaxVars_transposed", target});
+ node_upscale_net_FakeQuantWithMinMaxVars_transposed->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/upscale_net_FakeQuantWithMinMaxVars_transposed.npy", DataLayout::NHWC));
+
+ NodeID id_pre_upscale_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{12}, DataType::S32, QuantizationInfo(2.9644968435604824e-06), DataLayout::NHWC});
INode *node_pre_upscale_Conv2D_bias = _graph.node(id_pre_upscale_Conv2D_bias);
- node_pre_upscale_Conv2D_bias->set_common_node_parameters(NodeParams{ "pre_upscale_Conv2D_bias", target });
- node_pre_upscale_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_upscale_Conv2D_bias.npy", DataLayout::NHWC));
-
- NodeID id_pre_upscale_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 12 },
- DataType::QASYMM8,
- QuantizationInfo(0.000455576169770211, 128),
- DataLayout::NHWC });
+ node_pre_upscale_Conv2D_bias->set_common_node_parameters(NodeParams{"pre_upscale_Conv2D_bias", target});
+ node_pre_upscale_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_upscale_Conv2D_bias.npy", DataLayout::NHWC));
+
+ NodeID id_pre_upscale_FakeQuantWithMinMaxVars =
+ _graph.add_node<ConstNode>(TensorDescriptor{TensorShape{256, 3, 3, 12}, DataType::QASYMM8,
+ QuantizationInfo(0.000455576169770211, 128), DataLayout::NHWC});
INode *node_pre_upscale_FakeQuantWithMinMaxVars = _graph.node(id_pre_upscale_FakeQuantWithMinMaxVars);
- node_pre_upscale_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "pre_upscale_FakeQuantWithMinMaxVars", target });
- node_pre_upscale_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_upscale_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_post_residual_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.2760000345224398e-06),
- DataLayout::NHWC });
+ node_pre_upscale_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"pre_upscale_FakeQuantWithMinMaxVars", target});
+ node_pre_upscale_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/pre_upscale_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_post_residual_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.2760000345224398e-06), DataLayout::NHWC});
INode *node_post_residual_Conv2D_bias = _graph.node(id_post_residual_Conv2D_bias);
- node_post_residual_Conv2D_bias->set_common_node_parameters(NodeParams{ "post_residual_Conv2D_bias", target });
- node_post_residual_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/post_residual_Conv2D_bias.npy", DataLayout::NHWC));
+ node_post_residual_Conv2D_bias->set_common_node_parameters(NodeParams{"post_residual_Conv2D_bias", target});
+ node_post_residual_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/post_residual_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_post_residual_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00036424631252884865, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00036424631252884865, 129), DataLayout::NHWC});
INode *node_post_residual_FakeQuantWithMinMaxVars = _graph.node(id_post_residual_FakeQuantWithMinMaxVars);
- node_post_residual_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "post_residual_FakeQuantWithMinMaxVars", target });
- node_post_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/post_residual_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_15_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_post_residual_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"post_residual_FakeQuantWithMinMaxVars", target});
+ node_post_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/post_residual_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_15_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_15_y = _graph.node(id_mul_15_y);
- node_mul_15_y->set_common_node_parameters(NodeParams{ "mul_15_y", target });
- node_mul_15_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_15_y.npy", DataLayout::NHWC));
-
- NodeID id_block_15_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.2441644230420934e-06),
- DataLayout::NHWC });
+ node_mul_15_y->set_common_node_parameters(NodeParams{"mul_15_y", target});
+ node_mul_15_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_15_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_15_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.2441644230420934e-06), DataLayout::NHWC});
INode *node_block_15_1_Conv2D_bias = _graph.node(id_block_15_1_Conv2D_bias);
- node_block_15_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_15_1_Conv2D_bias", target });
- node_block_15_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_15_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_15_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_15_1_Conv2D_bias", target});
+ node_block_15_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_15_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_15_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00037038681330159307, 125),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00037038681330159307, 125), DataLayout::NHWC});
INode *node_block_15_1_FakeQuantWithMinMaxVars = _graph.node(id_block_15_1_FakeQuantWithMinMaxVars);
- node_block_15_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_15_1_FakeQuantWithMinMaxVars", target });
- node_block_15_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_15_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_14_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_15_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_15_1_FakeQuantWithMinMaxVars", target});
+ node_block_15_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_15_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_14_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_14_y = _graph.node(id_mul_14_y);
- node_mul_14_y->set_common_node_parameters(NodeParams{ "mul_14_y", target });
- node_mul_14_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_14_y.npy", DataLayout::NHWC));
-
- NodeID id_block_14_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.3417260333881131e-06),
- DataLayout::NHWC });
+ node_mul_14_y->set_common_node_parameters(NodeParams{"mul_14_y", target});
+ node_mul_14_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_14_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_14_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.3417260333881131e-06), DataLayout::NHWC});
INode *node_block_14_1_Conv2D_bias = _graph.node(id_block_14_1_Conv2D_bias);
- node_block_14_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_14_1_Conv2D_bias", target });
- node_block_14_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_14_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_14_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_14_1_Conv2D_bias", target});
+ node_block_14_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_14_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_14_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00040307495510205626, 127),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00040307495510205626, 127), DataLayout::NHWC});
INode *node_block_14_1_FakeQuantWithMinMaxVars = _graph.node(id_block_14_1_FakeQuantWithMinMaxVars);
- node_block_14_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_14_1_FakeQuantWithMinMaxVars", target });
- node_block_14_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_14_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_13_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_14_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_14_1_FakeQuantWithMinMaxVars", target});
+ node_block_14_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_14_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_13_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_13_y = _graph.node(id_mul_13_y);
- node_mul_13_y->set_common_node_parameters(NodeParams{ "mul_13_y", target });
- node_mul_13_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_13_y.npy", DataLayout::NHWC));
-
- NodeID id_block_13_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.2636977544389083e-06),
- DataLayout::NHWC });
+ node_mul_13_y->set_common_node_parameters(NodeParams{"mul_13_y", target});
+ node_mul_13_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_13_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_13_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.2636977544389083e-06), DataLayout::NHWC});
INode *node_block_13_1_Conv2D_bias = _graph.node(id_block_13_1_Conv2D_bias);
- node_block_13_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_13_1_Conv2D_bias", target });
- node_block_13_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_13_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_13_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_13_1_Conv2D_bias", target});
+ node_block_13_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_13_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_13_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003858553245663643, 131),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.0003858553245663643, 131), DataLayout::NHWC});
INode *node_block_13_1_FakeQuantWithMinMaxVars = _graph.node(id_block_13_1_FakeQuantWithMinMaxVars);
- node_block_13_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_13_1_FakeQuantWithMinMaxVars", target });
- node_block_13_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_13_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_12_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_13_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_13_1_FakeQuantWithMinMaxVars", target});
+ node_block_13_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_13_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_12_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_12_y = _graph.node(id_mul_12_y);
- node_mul_12_y->set_common_node_parameters(NodeParams{ "mul_12_y", target });
- node_mul_12_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_12_y.npy", DataLayout::NHWC));
-
- NodeID id_block_12_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.3479783547154511e-06),
- DataLayout::NHWC });
+ node_mul_12_y->set_common_node_parameters(NodeParams{"mul_12_y", target});
+ node_mul_12_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_12_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_12_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.3479783547154511e-06), DataLayout::NHWC});
INode *node_block_12_1_Conv2D_bias = _graph.node(id_block_12_1_Conv2D_bias);
- node_block_12_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_12_1_Conv2D_bias", target });
- node_block_12_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_12_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_12_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_12_1_Conv2D_bias", target});
+ node_block_12_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_12_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_12_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00041212860378436744, 130),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00041212860378436744, 130), DataLayout::NHWC});
INode *node_block_12_1_FakeQuantWithMinMaxVars = _graph.node(id_block_12_1_FakeQuantWithMinMaxVars);
- node_block_12_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_12_1_FakeQuantWithMinMaxVars", target });
- node_block_12_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_12_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_11_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_12_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_12_1_FakeQuantWithMinMaxVars", target});
+ node_block_12_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_12_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_11_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_11_y = _graph.node(id_mul_11_y);
- node_mul_11_y->set_common_node_parameters(NodeParams{ "mul_11_y", target });
- node_mul_11_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_11_y.npy", DataLayout::NHWC));
-
- NodeID id_block_11_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.2847248171965475e-06),
- DataLayout::NHWC });
+ node_mul_11_y->set_common_node_parameters(NodeParams{"mul_11_y", target});
+ node_mul_11_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_11_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_11_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.2847248171965475e-06), DataLayout::NHWC});
INode *node_block_11_1_Conv2D_bias = _graph.node(id_block_11_1_Conv2D_bias);
- node_block_11_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_11_1_Conv2D_bias", target });
- node_block_11_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_11_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_11_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_11_1_Conv2D_bias", target});
+ node_block_11_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_11_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_11_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00040296532097272575, 131),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00040296532097272575, 131), DataLayout::NHWC});
INode *node_block_11_1_FakeQuantWithMinMaxVars = _graph.node(id_block_11_1_FakeQuantWithMinMaxVars);
- node_block_11_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_11_1_FakeQuantWithMinMaxVars", target });
- node_block_11_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_11_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_10_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_11_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_11_1_FakeQuantWithMinMaxVars", target});
+ node_block_11_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_11_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_10_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_10_y = _graph.node(id_mul_10_y);
- node_mul_10_y->set_common_node_parameters(NodeParams{ "mul_10_y", target });
- node_mul_10_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_10_y.npy", DataLayout::NHWC));
-
- NodeID id_block_10_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.1997129831797793e-06),
- DataLayout::NHWC });
+ node_mul_10_y->set_common_node_parameters(NodeParams{"mul_10_y", target});
+ node_mul_10_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_10_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_10_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.1997129831797793e-06), DataLayout::NHWC});
INode *node_block_10_1_Conv2D_bias = _graph.node(id_block_10_1_Conv2D_bias);
- node_block_10_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_10_1_Conv2D_bias", target });
- node_block_10_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_10_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_10_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_10_1_Conv2D_bias", target});
+ node_block_10_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_10_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_10_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00036640543839894235, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00036640543839894235, 129), DataLayout::NHWC});
INode *node_block_10_1_FakeQuantWithMinMaxVars = _graph.node(id_block_10_1_FakeQuantWithMinMaxVars);
- node_block_10_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_10_1_FakeQuantWithMinMaxVars", target });
- node_block_10_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_10_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_9_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_10_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_10_1_FakeQuantWithMinMaxVars", target});
+ node_block_10_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_10_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_9_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_9_y = _graph.node(id_mul_9_y);
- node_mul_9_y->set_common_node_parameters(NodeParams{ "mul_9_y", target });
- node_mul_9_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_9_y.npy", DataLayout::NHWC));
-
- NodeID id_block_9_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.1920226370421005e-06),
- DataLayout::NHWC });
+ node_mul_9_y->set_common_node_parameters(NodeParams{"mul_9_y", target});
+ node_mul_9_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_9_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_9_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.1920226370421005e-06), DataLayout::NHWC});
INode *node_block_9_1_Conv2D_bias = _graph.node(id_block_9_1_Conv2D_bias);
- node_block_9_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_9_1_Conv2D_bias", target });
- node_block_9_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_9_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_9_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_9_1_Conv2D_bias", target});
+ node_block_9_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_9_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_9_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003706997958943248, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.0003706997958943248, 129), DataLayout::NHWC});
INode *node_block_9_1_FakeQuantWithMinMaxVars = _graph.node(id_block_9_1_FakeQuantWithMinMaxVars);
- node_block_9_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_9_1_FakeQuantWithMinMaxVars", target });
- node_block_9_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_9_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_8_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_9_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_9_1_FakeQuantWithMinMaxVars", target});
+ node_block_9_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_9_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_8_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_8_y = _graph.node(id_mul_8_y);
- node_mul_8_y->set_common_node_parameters(NodeParams{ "mul_8_y", target });
- node_mul_8_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_8_y.npy", DataLayout::NHWC));
-
- NodeID id_block_8_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.218903321387188e-06),
- DataLayout::NHWC });
+ node_mul_8_y->set_common_node_parameters(NodeParams{"mul_8_y", target});
+ node_mul_8_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_8_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_8_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.218903321387188e-06), DataLayout::NHWC});
INode *node_block_8_1_Conv2D_bias = _graph.node(id_block_8_1_Conv2D_bias);
- node_block_8_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_8_1_Conv2D_bias", target });
- node_block_8_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_8_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_8_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_8_1_Conv2D_bias", target});
+ node_block_8_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_8_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_8_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00038377835880964994, 127),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00038377835880964994, 127), DataLayout::NHWC});
INode *node_block_8_1_FakeQuantWithMinMaxVars = _graph.node(id_block_8_1_FakeQuantWithMinMaxVars);
- node_block_8_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_8_1_FakeQuantWithMinMaxVars", target });
- node_block_8_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_8_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_7_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_8_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_8_1_FakeQuantWithMinMaxVars", target});
+ node_block_8_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_8_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_7_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_7_y = _graph.node(id_mul_7_y);
- node_mul_7_y->set_common_node_parameters(NodeParams{ "mul_7_y", target });
- node_mul_7_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_7_y.npy", DataLayout::NHWC));
-
- NodeID id_block_7_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.257252392861119e-06),
- DataLayout::NHWC });
+ node_mul_7_y->set_common_node_parameters(NodeParams{"mul_7_y", target});
+ node_mul_7_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_7_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_7_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.257252392861119e-06), DataLayout::NHWC});
INode *node_block_7_1_Conv2D_bias = _graph.node(id_block_7_1_Conv2D_bias);
- node_block_7_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_7_1_Conv2D_bias", target });
- node_block_7_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_7_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_7_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_7_1_Conv2D_bias", target});
+ node_block_7_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_7_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_7_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00039844686398282647, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00039844686398282647, 129), DataLayout::NHWC});
INode *node_block_7_1_FakeQuantWithMinMaxVars = _graph.node(id_block_7_1_FakeQuantWithMinMaxVars);
- node_block_7_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_7_1_FakeQuantWithMinMaxVars", target });
- node_block_7_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_7_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_6_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_7_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_7_1_FakeQuantWithMinMaxVars", target});
+ node_block_7_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_7_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_6_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_6_y = _graph.node(id_mul_6_y);
- node_mul_6_y->set_common_node_parameters(NodeParams{ "mul_6_y", target });
- node_mul_6_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_6_y.npy", DataLayout::NHWC));
-
- NodeID id_block_6_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.244850636794581e-06),
- DataLayout::NHWC });
+ node_mul_6_y->set_common_node_parameters(NodeParams{"mul_6_y", target});
+ node_mul_6_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_6_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_6_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.244850636794581e-06), DataLayout::NHWC});
INode *node_block_6_1_Conv2D_bias = _graph.node(id_block_6_1_Conv2D_bias);
- node_block_6_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_6_1_Conv2D_bias", target });
- node_block_6_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_6_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_6_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_6_1_Conv2D_bias", target});
+ node_block_6_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_6_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_6_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00040187727427110076, 132),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00040187727427110076, 132), DataLayout::NHWC});
INode *node_block_6_1_FakeQuantWithMinMaxVars = _graph.node(id_block_6_1_FakeQuantWithMinMaxVars);
- node_block_6_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_6_1_FakeQuantWithMinMaxVars", target });
- node_block_6_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_6_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_5_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_6_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_6_1_FakeQuantWithMinMaxVars", target});
+ node_block_6_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_6_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_5_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_5_y = _graph.node(id_mul_5_y);
- node_mul_5_y->set_common_node_parameters(NodeParams{ "mul_5_y", target });
- node_mul_5_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_5_y.npy", DataLayout::NHWC));
-
- NodeID id_block_5_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.241092718373693e-06),
- DataLayout::NHWC });
+ node_mul_5_y->set_common_node_parameters(NodeParams{"mul_5_y", target});
+ node_mul_5_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_5_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_5_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.241092718373693e-06), DataLayout::NHWC});
INode *node_block_5_1_Conv2D_bias = _graph.node(id_block_5_1_Conv2D_bias);
- node_block_5_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_5_1_Conv2D_bias", target });
- node_block_5_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_5_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_5_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_5_1_Conv2D_bias", target});
+ node_block_5_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_5_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_5_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003938926674891263, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.0003938926674891263, 129), DataLayout::NHWC});
INode *node_block_5_1_FakeQuantWithMinMaxVars = _graph.node(id_block_5_1_FakeQuantWithMinMaxVars);
- node_block_5_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_5_1_FakeQuantWithMinMaxVars", target });
- node_block_5_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_5_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_4_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_5_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_5_1_FakeQuantWithMinMaxVars", target});
+ node_block_5_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_5_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_4_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_4_y = _graph.node(id_mul_4_y);
- node_mul_4_y->set_common_node_parameters(NodeParams{ "mul_4_y", target });
- node_mul_4_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_4_y.npy", DataLayout::NHWC));
-
- NodeID id_block_4_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.1748390988941537e-06),
- DataLayout::NHWC });
+ node_mul_4_y->set_common_node_parameters(NodeParams{"mul_4_y", target});
+ node_mul_4_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_4_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_4_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.1748390988941537e-06), DataLayout::NHWC});
INode *node_block_4_1_Conv2D_bias = _graph.node(id_block_4_1_Conv2D_bias);
- node_block_4_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_4_1_Conv2D_bias", target });
- node_block_4_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_4_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_4_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_4_1_Conv2D_bias", target});
+ node_block_4_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_4_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_4_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003788181929849088, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.0003788181929849088, 129), DataLayout::NHWC});
INode *node_block_4_1_FakeQuantWithMinMaxVars = _graph.node(id_block_4_1_FakeQuantWithMinMaxVars);
- node_block_4_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_4_1_FakeQuantWithMinMaxVars", target });
- node_block_4_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_4_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_3_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_4_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_4_1_FakeQuantWithMinMaxVars", target});
+ node_block_4_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_4_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_3_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_3_y = _graph.node(id_mul_3_y);
- node_mul_3_y->set_common_node_parameters(NodeParams{ "mul_3_y", target });
- node_mul_3_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_3_y.npy", DataLayout::NHWC));
-
- NodeID id_block_3_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.1937011095142225e-06),
- DataLayout::NHWC });
+ node_mul_3_y->set_common_node_parameters(NodeParams{"mul_3_y", target});
+ node_mul_3_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_3_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_3_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.1937011095142225e-06), DataLayout::NHWC});
INode *node_block_3_1_Conv2D_bias = _graph.node(id_block_3_1_Conv2D_bias);
- node_block_3_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_3_1_Conv2D_bias", target });
- node_block_3_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_3_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_3_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_3_1_Conv2D_bias", target});
+ node_block_3_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_3_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_3_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003944312920793891, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.0003944312920793891, 129), DataLayout::NHWC});
INode *node_block_3_1_FakeQuantWithMinMaxVars = _graph.node(id_block_3_1_FakeQuantWithMinMaxVars);
- node_block_3_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_3_1_FakeQuantWithMinMaxVars", target });
- node_block_3_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_3_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_2_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_3_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_3_1_FakeQuantWithMinMaxVars", target});
+ node_block_3_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_3_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_2_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_2_y = _graph.node(id_mul_2_y);
- node_mul_2_y->set_common_node_parameters(NodeParams{ "mul_2_y", target });
- node_mul_2_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_2_y.npy", DataLayout::NHWC));
-
- NodeID id_block_2_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.1634580232566805e-06),
- DataLayout::NHWC });
+ node_mul_2_y->set_common_node_parameters(NodeParams{"mul_2_y", target});
+ node_mul_2_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_2_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_2_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.1634580232566805e-06), DataLayout::NHWC});
INode *node_block_2_1_Conv2D_bias = _graph.node(id_block_2_1_Conv2D_bias);
- node_block_2_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_2_1_Conv2D_bias", target });
- node_block_2_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_2_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_2_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_2_1_Conv2D_bias", target});
+ node_block_2_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_2_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_2_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003789655165746808, 132),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.0003789655165746808, 132), DataLayout::NHWC});
INode *node_block_2_1_FakeQuantWithMinMaxVars = _graph.node(id_block_2_1_FakeQuantWithMinMaxVars);
- node_block_2_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_2_1_FakeQuantWithMinMaxVars", target });
- node_block_2_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_2_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_1_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_2_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_2_1_FakeQuantWithMinMaxVars", target});
+ node_block_2_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_2_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_1_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_1_y = _graph.node(id_mul_1_y);
- node_mul_1_y->set_common_node_parameters(NodeParams{ "mul_1_y", target });
- node_mul_1_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_1_y.npy", DataLayout::NHWC));
-
- NodeID id_block_1_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.197920255435747e-06),
- DataLayout::NHWC });
+ node_mul_1_y->set_common_node_parameters(NodeParams{"mul_1_y", target});
+ node_mul_1_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_1_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_1_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.197920255435747e-06), DataLayout::NHWC});
INode *node_block_1_1_Conv2D_bias = _graph.node(id_block_1_1_Conv2D_bias);
- node_block_1_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_1_1_Conv2D_bias", target });
- node_block_1_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_1_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_1_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_1_1_Conv2D_bias", target});
+ node_block_1_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_1_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_1_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00038527738070115447, 132),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00038527738070115447, 132), DataLayout::NHWC});
INode *node_block_1_1_FakeQuantWithMinMaxVars = _graph.node(id_block_1_1_FakeQuantWithMinMaxVars);
- node_block_1_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_1_1_FakeQuantWithMinMaxVars", target });
- node_block_1_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_1_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_mul_y = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 1 },
- DataType::QASYMM8,
- QuantizationInfo(0.0003921568568330258),
- DataLayout::NHWC });
+ node_block_1_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_1_1_FakeQuantWithMinMaxVars", target});
+ node_block_1_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_1_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_mul_y = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{1}, DataType::QASYMM8, QuantizationInfo(0.0003921568568330258), DataLayout::NHWC});
INode *node_mul_y = _graph.node(id_mul_y);
- node_mul_y->set_common_node_parameters(NodeParams{ "mul_y", target });
- node_mul_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_y.npy", DataLayout::NHWC));
-
- NodeID id_block_0_1_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.315485519626236e-06),
- DataLayout::NHWC });
+ node_mul_y->set_common_node_parameters(NodeParams{"mul_y", target});
+ node_mul_y->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/mul_y.npy", DataLayout::NHWC));
+
+ NodeID id_block_0_1_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.315485519626236e-06), DataLayout::NHWC});
INode *node_block_0_1_Conv2D_bias = _graph.node(id_block_0_1_Conv2D_bias);
- node_block_0_1_Conv2D_bias->set_common_node_parameters(NodeParams{ "block_0_1_Conv2D_bias", target });
- node_block_0_1_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_0_1_Conv2D_bias.npy", DataLayout::NHWC));
+ node_block_0_1_Conv2D_bias->set_common_node_parameters(NodeParams{"block_0_1_Conv2D_bias", target});
+ node_block_0_1_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/block_0_1_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_block_0_1_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.00039420535904355347, 129),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{256, 3, 3, 256}, DataType::QASYMM8,
+ QuantizationInfo(0.00039420535904355347, 129), DataLayout::NHWC});
INode *node_block_0_1_FakeQuantWithMinMaxVars = _graph.node(id_block_0_1_FakeQuantWithMinMaxVars);
- node_block_0_1_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "block_0_1_FakeQuantWithMinMaxVars", target });
- node_block_0_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/block_0_1_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
-
- NodeID id_pre_residual_Conv2D_bias = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 256 },
- DataType::S32,
- QuantizationInfo(1.7214160834555514e-06),
- DataLayout::NHWC });
+ node_block_0_1_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"block_0_1_FakeQuantWithMinMaxVars", target});
+ node_block_0_1_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/block_0_1_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
+
+ NodeID id_pre_residual_Conv2D_bias = _graph.add_node<ConstNode>(TensorDescriptor{
+ TensorShape{256}, DataType::S32, QuantizationInfo(1.7214160834555514e-06), DataLayout::NHWC});
INode *node_pre_residual_Conv2D_bias = _graph.node(id_pre_residual_Conv2D_bias);
- node_pre_residual_Conv2D_bias->set_common_node_parameters(NodeParams{ "pre_residual_Conv2D_bias", target });
- node_pre_residual_Conv2D_bias->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_residual_Conv2D_bias.npy", DataLayout::NHWC));
+ node_pre_residual_Conv2D_bias->set_common_node_parameters(NodeParams{"pre_residual_Conv2D_bias", target});
+ node_pre_residual_Conv2D_bias->output(0)->set_accessor(
+ get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_residual_Conv2D_bias.npy", DataLayout::NHWC));
NodeID id_pre_residual_FakeQuantWithMinMaxVars = _graph.add_node<ConstNode>(
- TensorDescriptor
- {
- TensorShape{ 3, 3, 3, 256 },
- DataType::QASYMM8,
- QuantizationInfo(0.0004389610840007663, 127),
- DataLayout::NHWC });
+ TensorDescriptor{TensorShape{3, 3, 3, 256}, DataType::QASYMM8, QuantizationInfo(0.0004389610840007663, 127),
+ DataLayout::NHWC});
INode *node_pre_residual_FakeQuantWithMinMaxVars = _graph.node(id_pre_residual_FakeQuantWithMinMaxVars);
- node_pre_residual_FakeQuantWithMinMaxVars->set_common_node_parameters(NodeParams{ "pre_residual_FakeQuantWithMinMaxVars", target });
- node_pre_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/edsr_model/pre_residual_FakeQuantWithMinMaxVars.npy",
- DataLayout::NHWC));
+ node_pre_residual_FakeQuantWithMinMaxVars->set_common_node_parameters(
+ NodeParams{"pre_residual_FakeQuantWithMinMaxVars", target});
+ node_pre_residual_FakeQuantWithMinMaxVars->output(0)->set_accessor(get_weights_accessor(
+ data_path, "/cnn_data/edsr_model/pre_residual_FakeQuantWithMinMaxVars.npy", DataLayout::NHWC));
TensorShape input_shape{};
input_shape.set(0, 3, false).set(1, 360, false).set(2, 640, false).set(3, 1, false);
NodeID id_input = _graph.add_node<InputNode>(
- TensorDescriptor
- {
- input_shape,
- DataType::QASYMM8,
- QuantizationInfo(0.003921568859368563),
- DataLayout::NHWC });
+ TensorDescriptor{input_shape, DataType::QASYMM8, QuantizationInfo(0.003921568859368563), DataLayout::NHWC});
INode *node_input = _graph.node(id_input);
- node_input->set_common_node_parameters(NodeParams{ "input", target });
+ node_input->set_common_node_parameters(NodeParams{"input", target});
node_input->output(0)->set_accessor(get_input_accessor(common_params));
- NodeID id_pre_residual_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.0033370566088706255, 96));
+ NodeID id_pre_residual_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.0033370566088706255, 96));
INode *node_pre_residual_BiasAdd = _graph.node(id_pre_residual_BiasAdd);
- node_pre_residual_BiasAdd->set_common_node_parameters(NodeParams{ "pre_residual_BiasAdd", target });
+ node_pre_residual_BiasAdd->set_common_node_parameters(NodeParams{"pre_residual_BiasAdd", target});
_graph.add_connection(id_input, 0, id_pre_residual_BiasAdd, 0);
_graph.add_connection(id_pre_residual_FakeQuantWithMinMaxVars, 0, id_pre_residual_BiasAdd, 1);
_graph.add_connection(id_pre_residual_Conv2D_bias, 0, id_pre_residual_BiasAdd, 2);
- NodeID id_block_0_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.007344874087721109, 185));
+ NodeID id_block_0_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.007344874087721109, 185));
INode *node_block_0_1_BiasAdd = _graph.node(id_block_0_1_BiasAdd);
- node_block_0_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_0_1_BiasAdd", target });
+ node_block_0_1_BiasAdd->set_common_node_parameters(NodeParams{"block_0_1_BiasAdd", target});
_graph.add_connection(id_pre_residual_BiasAdd, 0, id_block_0_1_BiasAdd, 0);
_graph.add_connection(id_block_0_1_FakeQuantWithMinMaxVars, 0, id_block_0_1_BiasAdd, 1);
_graph.add_connection(id_block_0_1_Conv2D_bias, 0, id_block_0_1_BiasAdd, 2);
NodeID id_mul = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0006341293919831514, 174 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0006341293919831514, 174}});
INode *node_mul = _graph.node(id_mul);
- node_mul->set_common_node_parameters(NodeParams{ "mul", target });
+ node_mul->set_common_node_parameters(NodeParams{"mul", target});
_graph.add_connection(id_block_0_1_BiasAdd, 0, id_mul, 0);
_graph.add_connection(id_mul_y, 0, id_mul, 1);
NodeID id_add = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0031092411372810602, 95 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0031092411372810602, 95}});
INode *node_add = _graph.node(id_add);
- node_add->set_common_node_parameters(NodeParams{ "add", target });
+ node_add->set_common_node_parameters(NodeParams{"add", target});
_graph.add_connection(id_pre_residual_BiasAdd, 0, id_add, 0);
_graph.add_connection(id_mul, 0, id_add, 1);
- NodeID id_block_1_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.005333727691322565, 117));
+ NodeID id_block_1_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.005333727691322565, 117));
INode *node_block_1_1_BiasAdd = _graph.node(id_block_1_1_BiasAdd);
- node_block_1_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_1_1_BiasAdd", target });
+ node_block_1_1_BiasAdd->set_common_node_parameters(NodeParams{"block_1_1_BiasAdd", target});
_graph.add_connection(id_add, 0, id_block_1_1_BiasAdd, 0);
_graph.add_connection(id_block_1_1_FakeQuantWithMinMaxVars, 0, id_block_1_1_BiasAdd, 1);
_graph.add_connection(id_block_1_1_Conv2D_bias, 0, id_block_1_1_BiasAdd, 2);
NodeID id_mul_1 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004965941770933568, 122 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004965941770933568, 122}});
INode *node_mul_1 = _graph.node(id_mul_1);
- node_mul_1->set_common_node_parameters(NodeParams{ "mul_1", target });
+ node_mul_1->set_common_node_parameters(NodeParams{"mul_1", target});
_graph.add_connection(id_block_1_1_BiasAdd, 0, id_mul_1, 0);
_graph.add_connection(id_mul_1_y, 0, id_mul_1, 1);
NodeID id_add_1 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0030700892675668, 96 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0030700892675668, 96}});
INode *node_add_1 = _graph.node(id_add_1);
- node_add_1->set_common_node_parameters(NodeParams{ "add_1", target });
+ node_add_1->set_common_node_parameters(NodeParams{"add_1", target});
_graph.add_connection(id_add, 0, id_add_1, 0);
_graph.add_connection(id_mul_1, 0, id_add_1, 1);
- NodeID id_block_2_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.004199742339551449, 132));
+ NodeID id_block_2_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.004199742339551449, 132));
INode *node_block_2_1_BiasAdd = _graph.node(id_block_2_1_BiasAdd);
- node_block_2_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_2_1_BiasAdd", target });
+ node_block_2_1_BiasAdd->set_common_node_parameters(NodeParams{"block_2_1_BiasAdd", target});
_graph.add_connection(id_add_1, 0, id_block_2_1_BiasAdd, 0);
_graph.add_connection(id_block_2_1_FakeQuantWithMinMaxVars, 0, id_block_2_1_BiasAdd, 1);
_graph.add_connection(id_block_2_1_Conv2D_bias, 0, id_block_2_1_BiasAdd, 2);
NodeID id_mul_2 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004133903712499887, 130 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004133903712499887, 130}});
INode *node_mul_2 = _graph.node(id_mul_2);
- node_mul_2->set_common_node_parameters(NodeParams{ "mul_2", target });
+ node_mul_2->set_common_node_parameters(NodeParams{"mul_2", target});
_graph.add_connection(id_block_2_1_BiasAdd, 0, id_mul_2, 0);
_graph.add_connection(id_mul_2_y, 0, id_mul_2, 1);
NodeID id_add_2 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003026385325938463, 94 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003026385325938463, 94}});
INode *node_add_2 = _graph.node(id_add_2);
- node_add_2->set_common_node_parameters(NodeParams{ "add_2", target });
+ node_add_2->set_common_node_parameters(NodeParams{"add_2", target});
_graph.add_connection(id_add_1, 0, id_add_2, 0);
_graph.add_connection(id_mul_2, 0, id_add_2, 1);
- NodeID id_block_3_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.003977528307586908, 142));
+ NodeID id_block_3_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.003977528307586908, 142));
INode *node_block_3_1_BiasAdd = _graph.node(id_block_3_1_BiasAdd);
- node_block_3_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_3_1_BiasAdd", target });
+ node_block_3_1_BiasAdd->set_common_node_parameters(NodeParams{"block_3_1_BiasAdd", target});
_graph.add_connection(id_add_2, 0, id_block_3_1_BiasAdd, 0);
_graph.add_connection(id_block_3_1_FakeQuantWithMinMaxVars, 0, id_block_3_1_BiasAdd, 1);
_graph.add_connection(id_block_3_1_Conv2D_bias, 0, id_block_3_1_BiasAdd, 2);
NodeID id_mul_3 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0003943995980080217, 141 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0003943995980080217, 141}});
INode *node_mul_3 = _graph.node(id_mul_3);
- node_mul_3->set_common_node_parameters(NodeParams{ "mul_3", target });
+ node_mul_3->set_common_node_parameters(NodeParams{"mul_3", target});
_graph.add_connection(id_block_3_1_BiasAdd, 0, id_mul_3, 0);
_graph.add_connection(id_mul_3_y, 0, id_mul_3, 1);
NodeID id_add_3 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003101327223703265, 98 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003101327223703265, 98}});
INode *node_add_3 = _graph.node(id_add_3);
- node_add_3->set_common_node_parameters(NodeParams{ "add_3", target });
+ node_add_3->set_common_node_parameters(NodeParams{"add_3", target});
_graph.add_connection(id_add_2, 0, id_add_3, 0);
_graph.add_connection(id_mul_3, 0, id_add_3, 1);
- NodeID id_block_4_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.0045388080179691315, 146));
+ NodeID id_block_4_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.0045388080179691315, 146));
INode *node_block_4_1_BiasAdd = _graph.node(id_block_4_1_BiasAdd);
- node_block_4_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_4_1_BiasAdd", target });
+ node_block_4_1_BiasAdd->set_common_node_parameters(NodeParams{"block_4_1_BiasAdd", target});
_graph.add_connection(id_add_3, 0, id_block_4_1_BiasAdd, 0);
_graph.add_connection(id_block_4_1_FakeQuantWithMinMaxVars, 0, id_block_4_1_BiasAdd, 1);
_graph.add_connection(id_block_4_1_Conv2D_bias, 0, id_block_4_1_BiasAdd, 2);
NodeID id_mul_4 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00044342130422592163, 143 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00044342130422592163, 143}});
INode *node_mul_4 = _graph.node(id_mul_4);
- node_mul_4->set_common_node_parameters(NodeParams{ "mul_4", target });
+ node_mul_4->set_common_node_parameters(NodeParams{"mul_4", target});
_graph.add_connection(id_block_4_1_BiasAdd, 0, id_mul_4, 0);
_graph.add_connection(id_mul_4_y, 0, id_mul_4, 1);
NodeID id_add_4 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003150839824229479, 98 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003150839824229479, 98}});
INode *node_add_4 = _graph.node(id_add_4);
- node_add_4->set_common_node_parameters(NodeParams{ "add_4", target });
+ node_add_4->set_common_node_parameters(NodeParams{"add_4", target});
_graph.add_connection(id_add_3, 0, id_add_4, 0);
_graph.add_connection(id_mul_4, 0, id_add_4, 1);
- NodeID id_block_5_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.00402890844270587, 132));
+ NodeID id_block_5_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.00402890844270587, 132));
INode *node_block_5_1_BiasAdd = _graph.node(id_block_5_1_BiasAdd);
- node_block_5_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_5_1_BiasAdd", target });
+ node_block_5_1_BiasAdd->set_common_node_parameters(NodeParams{"block_5_1_BiasAdd", target});
_graph.add_connection(id_add_4, 0, id_block_5_1_BiasAdd, 0);
_graph.add_connection(id_block_5_1_FakeQuantWithMinMaxVars, 0, id_block_5_1_BiasAdd, 1);
_graph.add_connection(id_block_5_1_Conv2D_bias, 0, id_block_5_1_BiasAdd, 2);
NodeID id_mul_5 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004023382789455354, 132 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004023382789455354, 132}});
INode *node_mul_5 = _graph.node(id_mul_5);
- node_mul_5->set_common_node_parameters(NodeParams{ "mul_5", target });
+ node_mul_5->set_common_node_parameters(NodeParams{"mul_5", target});
_graph.add_connection(id_block_5_1_BiasAdd, 0, id_mul_5, 0);
_graph.add_connection(id_mul_5_y, 0, id_mul_5, 1);
NodeID id_add_5 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0030975888948887587, 94 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0030975888948887587, 94}});
INode *node_add_5 = _graph.node(id_add_5);
- node_add_5->set_common_node_parameters(NodeParams{ "add_5", target });
+ node_add_5->set_common_node_parameters(NodeParams{"add_5", target});
_graph.add_connection(id_add_4, 0, id_add_5, 0);
_graph.add_connection(id_mul_5, 0, id_add_5, 1);
- NodeID id_block_6_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.00421866774559021, 125));
+ NodeID id_block_6_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.00421866774559021, 125));
INode *node_block_6_1_BiasAdd = _graph.node(id_block_6_1_BiasAdd);
- node_block_6_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_6_1_BiasAdd", target });
+ node_block_6_1_BiasAdd->set_common_node_parameters(NodeParams{"block_6_1_BiasAdd", target});
_graph.add_connection(id_add_5, 0, id_block_6_1_BiasAdd, 0);
_graph.add_connection(id_block_6_1_FakeQuantWithMinMaxVars, 0, id_block_6_1_BiasAdd, 1);
_graph.add_connection(id_block_6_1_Conv2D_bias, 0, id_block_6_1_BiasAdd, 2);
NodeID id_mul_6 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00041950203012675047, 125 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00041950203012675047, 125}});
INode *node_mul_6 = _graph.node(id_mul_6);
- node_mul_6->set_common_node_parameters(NodeParams{ "mul_6", target });
+ node_mul_6->set_common_node_parameters(NodeParams{"mul_6", target});
_graph.add_connection(id_block_6_1_BiasAdd, 0, id_mul_6, 0);
_graph.add_connection(id_mul_6_y, 0, id_mul_6, 1);
NodeID id_add_6 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003155382815748453, 92 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003155382815748453, 92}});
INode *node_add_6 = _graph.node(id_add_6);
- node_add_6->set_common_node_parameters(NodeParams{ "add_6", target });
+ node_add_6->set_common_node_parameters(NodeParams{"add_6", target});
_graph.add_connection(id_add_5, 0, id_add_6, 0);
_graph.add_connection(id_mul_6, 0, id_add_6, 1);
- NodeID id_block_7_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.004250136204063892, 143));
+ NodeID id_block_7_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.004250136204063892, 143));
INode *node_block_7_1_BiasAdd = _graph.node(id_block_7_1_BiasAdd);
- node_block_7_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_7_1_BiasAdd", target });
+ node_block_7_1_BiasAdd->set_common_node_parameters(NodeParams{"block_7_1_BiasAdd", target});
_graph.add_connection(id_add_6, 0, id_block_7_1_BiasAdd, 0);
_graph.add_connection(id_block_7_1_FakeQuantWithMinMaxVars, 0, id_block_7_1_BiasAdd, 1);
_graph.add_connection(id_block_7_1_Conv2D_bias, 0, id_block_7_1_BiasAdd, 2);
NodeID id_mul_7 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00042401350219734013, 142 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00042401350219734013, 142}});
INode *node_mul_7 = _graph.node(id_mul_7);
- node_mul_7->set_common_node_parameters(NodeParams{ "mul_7", target });
+ node_mul_7->set_common_node_parameters(NodeParams{"mul_7", target});
_graph.add_connection(id_block_7_1_BiasAdd, 0, id_mul_7, 0);
_graph.add_connection(id_mul_7_y, 0, id_mul_7, 1);
NodeID id_add_7 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0031760605052113533, 86 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0031760605052113533, 86}});
INode *node_add_7 = _graph.node(id_add_7);
- node_add_7->set_common_node_parameters(NodeParams{ "add_7", target });
+ node_add_7->set_common_node_parameters(NodeParams{"add_7", target});
_graph.add_connection(id_add_6, 0, id_add_7, 0);
_graph.add_connection(id_mul_7, 0, id_add_7, 1);
- NodeID id_block_8_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.004277155734598637, 123));
+ NodeID id_block_8_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.004277155734598637, 123));
INode *node_block_8_1_BiasAdd = _graph.node(id_block_8_1_BiasAdd);
- node_block_8_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_8_1_BiasAdd", target });
+ node_block_8_1_BiasAdd->set_common_node_parameters(NodeParams{"block_8_1_BiasAdd", target});
_graph.add_connection(id_add_7, 0, id_block_8_1_BiasAdd, 0);
_graph.add_connection(id_block_8_1_FakeQuantWithMinMaxVars, 0, id_block_8_1_BiasAdd, 1);
_graph.add_connection(id_block_8_1_Conv2D_bias, 0, id_block_8_1_BiasAdd, 2);
NodeID id_mul_8 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00042673019925132394, 123 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00042673019925132394, 123}});
INode *node_mul_8 = _graph.node(id_mul_8);
- node_mul_8->set_common_node_parameters(NodeParams{ "mul_8", target });
+ node_mul_8->set_common_node_parameters(NodeParams{"mul_8", target});
_graph.add_connection(id_block_8_1_BiasAdd, 0, id_mul_8, 0);
_graph.add_connection(id_mul_8_y, 0, id_mul_8, 1);
NodeID id_add_8 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0032156009692698717, 86 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0032156009692698717, 86}});
INode *node_add_8 = _graph.node(id_add_8);
- node_add_8->set_common_node_parameters(NodeParams{ "add_8", target });
+ node_add_8->set_common_node_parameters(NodeParams{"add_8", target});
_graph.add_connection(id_add_7, 0, id_add_8, 0);
_graph.add_connection(id_mul_8, 0, id_add_8, 1);
- NodeID id_block_9_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.00445037754252553, 129));
+ NodeID id_block_9_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.00445037754252553, 129));
INode *node_block_9_1_BiasAdd = _graph.node(id_block_9_1_BiasAdd);
- node_block_9_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_9_1_BiasAdd", target });
+ node_block_9_1_BiasAdd->set_common_node_parameters(NodeParams{"block_9_1_BiasAdd", target});
_graph.add_connection(id_add_8, 0, id_block_9_1_BiasAdd, 0);
_graph.add_connection(id_block_9_1_FakeQuantWithMinMaxVars, 0, id_block_9_1_BiasAdd, 1);
_graph.add_connection(id_block_9_1_Conv2D_bias, 0, id_block_9_1_BiasAdd, 2);
NodeID id_mul_9 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004448975087143481, 129 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004448975087143481, 129}});
INode *node_mul_9 = _graph.node(id_mul_9);
- node_mul_9->set_common_node_parameters(NodeParams{ "mul_9", target });
+ node_mul_9->set_common_node_parameters(NodeParams{"mul_9", target});
_graph.add_connection(id_block_9_1_BiasAdd, 0, id_mul_9, 0);
_graph.add_connection(id_mul_9_y, 0, id_mul_9, 1);
NodeID id_add_9 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0032742770854383707, 80 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0032742770854383707, 80}});
INode *node_add_9 = _graph.node(id_add_9);
- node_add_9->set_common_node_parameters(NodeParams{ "add_9", target });
+ node_add_9->set_common_node_parameters(NodeParams{"add_9", target});
_graph.add_connection(id_add_8, 0, id_add_9, 0);
_graph.add_connection(id_mul_9, 0, id_add_9, 1);
- NodeID id_block_10_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.003614710411056876, 131));
+ NodeID id_block_10_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.003614710411056876, 131));
INode *node_block_10_1_BiasAdd = _graph.node(id_block_10_1_BiasAdd);
- node_block_10_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_10_1_BiasAdd", target });
+ node_block_10_1_BiasAdd->set_common_node_parameters(NodeParams{"block_10_1_BiasAdd", target});
_graph.add_connection(id_add_9, 0, id_block_10_1_BiasAdd, 0);
_graph.add_connection(id_block_10_1_FakeQuantWithMinMaxVars, 0, id_block_10_1_BiasAdd, 1);
_graph.add_connection(id_block_10_1_Conv2D_bias, 0, id_block_10_1_BiasAdd, 2);
NodeID id_mul_10 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00036083892337046564, 130 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00036083892337046564, 130}});
INode *node_mul_10 = _graph.node(id_mul_10);
- node_mul_10->set_common_node_parameters(NodeParams{ "mul_10", target });
+ node_mul_10->set_common_node_parameters(NodeParams{"mul_10", target});
_graph.add_connection(id_block_10_1_BiasAdd, 0, id_mul_10, 0);
_graph.add_connection(id_mul_10_y, 0, id_mul_10, 1);
NodeID id_add_10 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0031881770119071007, 81 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0031881770119071007, 81}});
INode *node_add_10 = _graph.node(id_add_10);
- node_add_10->set_common_node_parameters(NodeParams{ "add_10", target });
+ node_add_10->set_common_node_parameters(NodeParams{"add_10", target});
_graph.add_connection(id_add_9, 0, id_add_10, 0);
_graph.add_connection(id_mul_10, 0, id_add_10, 1);
- NodeID id_block_11_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.003969002980738878, 133));
+ NodeID id_block_11_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.003969002980738878, 133));
INode *node_block_11_1_BiasAdd = _graph.node(id_block_11_1_BiasAdd);
- node_block_11_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_11_1_BiasAdd", target });
+ node_block_11_1_BiasAdd->set_common_node_parameters(NodeParams{"block_11_1_BiasAdd", target});
_graph.add_connection(id_add_10, 0, id_block_11_1_BiasAdd, 0);
_graph.add_connection(id_block_11_1_FakeQuantWithMinMaxVars, 0, id_block_11_1_BiasAdd, 1);
_graph.add_connection(id_block_11_1_Conv2D_bias, 0, id_block_11_1_BiasAdd, 2);
NodeID id_mul_11 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0003968806122429669, 133 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0003968806122429669, 133}});
INode *node_mul_11 = _graph.node(id_mul_11);
- node_mul_11->set_common_node_parameters(NodeParams{ "mul_11", target });
+ node_mul_11->set_common_node_parameters(NodeParams{"mul_11", target});
_graph.add_connection(id_block_11_1_BiasAdd, 0, id_mul_11, 0);
_graph.add_connection(id_mul_11_y, 0, id_mul_11, 1);
NodeID id_add_11 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0032707711216062307, 80 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0032707711216062307, 80}});
INode *node_add_11 = _graph.node(id_add_11);
- node_add_11->set_common_node_parameters(NodeParams{ "add_11", target });
+ node_add_11->set_common_node_parameters(NodeParams{"add_11", target});
_graph.add_connection(id_add_10, 0, id_add_11, 0);
_graph.add_connection(id_mul_11, 0, id_add_11, 1);
- NodeID id_block_12_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.004366801120340824, 110));
+ NodeID id_block_12_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.004366801120340824, 110));
INode *node_block_12_1_BiasAdd = _graph.node(id_block_12_1_BiasAdd);
- node_block_12_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_12_1_BiasAdd", target });
+ node_block_12_1_BiasAdd->set_common_node_parameters(NodeParams{"block_12_1_BiasAdd", target});
_graph.add_connection(id_add_11, 0, id_block_12_1_BiasAdd, 0);
_graph.add_connection(id_block_12_1_FakeQuantWithMinMaxVars, 0, id_block_12_1_BiasAdd, 1);
_graph.add_connection(id_block_12_1_Conv2D_bias, 0, id_block_12_1_BiasAdd, 2);
NodeID id_mul_12 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004365936329122633, 110 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004365936329122633, 110}});
INode *node_mul_12 = _graph.node(id_mul_12);
- node_mul_12->set_common_node_parameters(NodeParams{ "mul_12", target });
+ node_mul_12->set_common_node_parameters(NodeParams{"mul_12", target});
_graph.add_connection(id_block_12_1_BiasAdd, 0, id_mul_12, 0);
_graph.add_connection(id_mul_12_y, 0, id_mul_12, 1);
NodeID id_add_12 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.003275055903941393, 79 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.003275055903941393, 79}});
INode *node_add_12 = _graph.node(id_add_12);
- node_add_12->set_common_node_parameters(NodeParams{ "add_12", target });
+ node_add_12->set_common_node_parameters(NodeParams{"add_12", target});
_graph.add_connection(id_add_11, 0, id_add_12, 0);
_graph.add_connection(id_mul_12, 0, id_add_12, 1);
- NodeID id_block_13_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.004386766813695431, 139));
+ NodeID id_block_13_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.004386766813695431, 139));
INode *node_block_13_1_BiasAdd = _graph.node(id_block_13_1_BiasAdd);
- node_block_13_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_13_1_BiasAdd", target });
+ node_block_13_1_BiasAdd->set_common_node_parameters(NodeParams{"block_13_1_BiasAdd", target});
_graph.add_connection(id_add_12, 0, id_block_13_1_BiasAdd, 0);
_graph.add_connection(id_block_13_1_FakeQuantWithMinMaxVars, 0, id_block_13_1_BiasAdd, 1);
_graph.add_connection(id_block_13_1_Conv2D_bias, 0, id_block_13_1_BiasAdd, 2);
NodeID id_mul_13 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004385628562886268, 139 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004385628562886268, 139}});
INode *node_mul_13 = _graph.node(id_mul_13);
- node_mul_13->set_common_node_parameters(NodeParams{ "mul_13", target });
+ node_mul_13->set_common_node_parameters(NodeParams{"mul_13", target});
_graph.add_connection(id_block_13_1_BiasAdd, 0, id_mul_13, 0);
_graph.add_connection(id_mul_13_y, 0, id_mul_13, 1);
NodeID id_add_13 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0033287261612713337, 78 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0033287261612713337, 78}});
INode *node_add_13 = _graph.node(id_add_13);
- node_add_13->set_common_node_parameters(NodeParams{ "add_13", target });
+ node_add_13->set_common_node_parameters(NodeParams{"add_13", target});
_graph.add_connection(id_add_12, 0, id_add_13, 0);
_graph.add_connection(id_mul_13, 0, id_add_13, 1);
- NodeID id_block_14_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.0038069337606430054, 130));
+ NodeID id_block_14_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.0038069337606430054, 130));
INode *node_block_14_1_BiasAdd = _graph.node(id_block_14_1_BiasAdd);
- node_block_14_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_14_1_BiasAdd", target });
+ node_block_14_1_BiasAdd->set_common_node_parameters(NodeParams{"block_14_1_BiasAdd", target});
_graph.add_connection(id_add_13, 0, id_block_14_1_BiasAdd, 0);
_graph.add_connection(id_block_14_1_FakeQuantWithMinMaxVars, 0, id_block_14_1_BiasAdd, 1);
_graph.add_connection(id_block_14_1_Conv2D_bias, 0, id_block_14_1_BiasAdd, 2);
NodeID id_mul_14 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.00037829321809113026, 130 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.00037829321809113026, 130}});
INode *node_mul_14 = _graph.node(id_mul_14);
- node_mul_14->set_common_node_parameters(NodeParams{ "mul_14", target });
+ node_mul_14->set_common_node_parameters(NodeParams{"mul_14", target});
_graph.add_connection(id_block_14_1_BiasAdd, 0, id_mul_14, 0);
_graph.add_connection(id_mul_14_y, 0, id_mul_14, 1);
NodeID id_add_14 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0033590947277843952, 77 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0033590947277843952, 77}});
INode *node_add_14 = _graph.node(id_add_14);
- node_add_14->set_common_node_parameters(NodeParams{ "add_14", target });
+ node_add_14->set_common_node_parameters(NodeParams{"add_14", target});
_graph.add_connection(id_add_13, 0, id_add_14, 0);
_graph.add_connection(id_mul_14, 0, id_add_14, 1);
- NodeID id_block_15_1_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.004009159281849861, 130));
+ NodeID id_block_15_1_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.004009159281849861, 130));
INode *node_block_15_1_BiasAdd = _graph.node(id_block_15_1_BiasAdd);
- node_block_15_1_BiasAdd->set_common_node_parameters(NodeParams{ "block_15_1_BiasAdd", target });
+ node_block_15_1_BiasAdd->set_common_node_parameters(NodeParams{"block_15_1_BiasAdd", target});
_graph.add_connection(id_add_14, 0, id_block_15_1_BiasAdd, 0);
_graph.add_connection(id_block_15_1_FakeQuantWithMinMaxVars, 0, id_block_15_1_BiasAdd, 1);
_graph.add_connection(id_block_15_1_Conv2D_bias, 0, id_block_15_1_BiasAdd, 2);
NodeID id_mul_15 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Mul, QuantizationInfo{ 0.0004008286341559142, 130 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Mul, QuantizationInfo{0.0004008286341559142, 130}});
INode *node_mul_15 = _graph.node(id_mul_15);
- node_mul_15->set_common_node_parameters(NodeParams{ "mul_15", target });
+ node_mul_15->set_common_node_parameters(NodeParams{"mul_15", target});
_graph.add_connection(id_block_15_1_BiasAdd, 0, id_mul_15, 0);
_graph.add_connection(id_mul_15_y, 0, id_mul_15, 1);
NodeID id_add_15 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0035031239967793226, 78 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0035031239967793226, 78}});
INode *node_add_15 = _graph.node(id_add_15);
- node_add_15->set_common_node_parameters(NodeParams{ "add_15", target });
+ node_add_15->set_common_node_parameters(NodeParams{"add_15", target});
_graph.add_connection(id_add_14, 0, id_add_15, 0);
_graph.add_connection(id_mul_15, 0, id_add_15, 1);
- NodeID id_post_residual_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.005167999770492315, 112));
+ NodeID id_post_residual_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.005167999770492315, 112));
INode *node_post_residual_BiasAdd = _graph.node(id_post_residual_BiasAdd);
- node_post_residual_BiasAdd->set_common_node_parameters(NodeParams{ "post_residual_BiasAdd", target });
+ node_post_residual_BiasAdd->set_common_node_parameters(NodeParams{"post_residual_BiasAdd", target});
_graph.add_connection(id_add_15, 0, id_post_residual_BiasAdd, 0);
_graph.add_connection(id_post_residual_FakeQuantWithMinMaxVars, 0, id_post_residual_BiasAdd, 1);
_graph.add_connection(id_post_residual_Conv2D_bias, 0, id_post_residual_BiasAdd, 2);
NodeID id_add_16 = _graph.add_node<EltwiseLayerNode>(
- descriptors::EltwiseLayerDescriptor{ EltwiseOperation::Add, QuantizationInfo{ 0.0065071373246610165, 89 } });
+ descriptors::EltwiseLayerDescriptor{EltwiseOperation::Add, QuantizationInfo{0.0065071373246610165, 89}});
INode *node_add_16 = _graph.node(id_add_16);
- node_add_16->set_common_node_parameters(NodeParams{ "add_16", target });
+ node_add_16->set_common_node_parameters(NodeParams{"add_16", target});
_graph.add_connection(id_post_residual_BiasAdd, 0, id_add_16, 0);
_graph.add_connection(id_pre_residual_BiasAdd, 0, id_add_16, 1);
- NodeID id_pre_upscale_BiasAdd = _graph.add_node<ConvolutionLayerNode>(
- PadStrideInfo
- {
- 1, 1,
- 1, 1,
- 1, 1,
- DimensionRoundingType::FLOOR },
- 1,
- arm_compute::graph::ConvolutionMethod::Default,
- FastMathHint::Disabled,
- QuantizationInfo(0.005013593938201666, 26));
+ NodeID id_pre_upscale_BiasAdd =
+ _graph.add_node<ConvolutionLayerNode>(PadStrideInfo{1, 1, 1, 1, 1, 1, DimensionRoundingType::FLOOR}, 1,
+ arm_compute::graph::ConvolutionMethod::Default,
+ FastMathHint::Disabled, QuantizationInfo(0.005013593938201666, 26));
INode *node_pre_upscale_BiasAdd = _graph.node(id_pre_upscale_BiasAdd);
- node_pre_upscale_BiasAdd->set_common_node_parameters(NodeParams{ "pre_upscale_BiasAdd", target });
+ node_pre_upscale_BiasAdd->set_common_node_parameters(NodeParams{"pre_upscale_BiasAdd", target});
_graph.add_connection(id_add_16, 0, id_pre_upscale_BiasAdd, 0);
_graph.add_connection(id_pre_upscale_FakeQuantWithMinMaxVars, 0, id_pre_upscale_BiasAdd, 1);
_graph.add_connection(id_pre_upscale_Conv2D_bias, 0, id_pre_upscale_BiasAdd, 2);
NodeID id_upscale_net_FakeQuantWithMinMaxVars_1 = _graph.add_node<DeconvolutionLayerNode>(
- descriptors::DeconvolutionLayerDescriptor
- {
- PadStrideInfo{
- 2, 2,
- 0, 0,
- 0, 0,
- DimensionRoundingType::FLOOR },
- QuantizationInfo{ 0.004990961868315935, 26 } });
+ descriptors::DeconvolutionLayerDescriptor{PadStrideInfo{2, 2, 0, 0, 0, 0, DimensionRoundingType::FLOOR},
+ QuantizationInfo{0.004990961868315935, 26}});
INode *node_upscale_net_FakeQuantWithMinMaxVars_1 = _graph.node(id_upscale_net_FakeQuantWithMinMaxVars_1);
- node_upscale_net_FakeQuantWithMinMaxVars_1->set_common_node_parameters(NodeParams{ "upscale_net_FakeQuantWithMinMaxVars_1", target });
+ node_upscale_net_FakeQuantWithMinMaxVars_1->set_common_node_parameters(
+ NodeParams{"upscale_net_FakeQuantWithMinMaxVars_1", target});
_graph.add_connection(id_pre_upscale_BiasAdd, 0, id_upscale_net_FakeQuantWithMinMaxVars_1, 0);
- _graph.add_connection(id_upscale_net_FakeQuantWithMinMaxVars_transposed, 0, id_upscale_net_FakeQuantWithMinMaxVars_1, 1);
+ _graph.add_connection(id_upscale_net_FakeQuantWithMinMaxVars_transposed, 0,
+ id_upscale_net_FakeQuantWithMinMaxVars_1, 1);
TensorShape output_shape;
output_shape.set(0, 3, false).set(1, 720, false).set(2, 1280, false).set(3, 1, false);
NodeID id_output_140211982446376 = _graph.add_node<OutputNode>();
INode *node_output_140211982446376 = _graph.node(id_output_140211982446376);
- node_output_140211982446376->set_common_node_parameters(NodeParams{ "output_140211982446376", target });
+ node_output_140211982446376->set_common_node_parameters(NodeParams{"output_140211982446376", target});
_graph.add_connection(id_upscale_net_FakeQuantWithMinMaxVars_1, 0, id_output_140211982446376, 0);
- node_output_140211982446376->input(0)->set_accessor(get_npy_output_accessor(expected_output_filename.value(), output_shape, common_params.data_type,
- common_params.data_layout));
+ node_output_140211982446376->input(0)->set_accessor(get_npy_output_accessor(
+ expected_output_filename.value(), output_shape, common_params.data_type, common_params.data_layout));
return true;
}
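Illustrative note (not part of the patch): every constant tensor in the generated EDSR graph above follows the same four-step pattern that clang-format is re-wrapping in these hunks — add a ConstNode with a TensorDescriptor, look the node up, set its NodeParams, and attach a .npy weights accessor. A minimal sketch of that pattern is given below for reference only; the free-function wrapper and its name add_const_npy_tensor are hypothetical, while the individual API calls are the ones visible in the hunks.

// Minimal sketch (illustrative assumption, not part of this patch) of the ConstNode
// pattern the generated EDSR graph repeats for every weight/bias tensor.
// The helper name add_const_npy_tensor is hypothetical; the calls it wraps appear above.
#include "arm_compute/graph.h"

#include "utils/GraphUtils.h"

using namespace arm_compute;
using namespace arm_compute::graph;
using namespace arm_compute::graph_utils;

NodeID add_const_npy_tensor(Graph             &g,
                            Target             target,
                            const std::string &name,
                            const TensorShape &shape,
                            DataType           dt,
                            QuantizationInfo   qinfo,
                            const std::string &data_path,
                            const std::string &npy_file)
{
    // Describe the constant tensor (shape, data type, quantization, layout) and add the node.
    NodeID id = g.add_node<ConstNode>(TensorDescriptor{shape, dt, qinfo, DataLayout::NHWC});

    // Name the node and bind it to the execution target.
    INode *node = g.node(id);
    node->set_common_node_parameters(NodeParams{name, target});

    // Attach an accessor that fills the tensor from a .npy file when the graph is finalized.
    node->output(0)->set_accessor(get_weights_accessor(data_path, npy_file, DataLayout::NHWC));
    return id;
}

Each ConstNode block in the diff above is one instantiation of this pattern with a concrete shape, quantization scale, and .npy filename.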
diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp
index 683205b3b5..f431fc412b 100644
--- a/examples/graph_googlenet.cpp
+++ b/examples/graph_googlenet.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphGooglenetExample : public Example
{
public:
- GraphGooglenetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "GoogleNet")
+ GraphGooglenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "GoogleNet")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,14 +49,15 @@ public:
common_params = consume_common_graph_parameters(common_opts);

 // Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -65,64 +66,99 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 122.68f, 116.67f, 104.01f } };
+ const std::array<float, 3> mean_rgb{{122.68f, 116.67f, 104.01f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
+ << ConvolutionLayer(7U, 7U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_b.npy"),
+ PadStrideInfo(2, 2, 3, 3))
+ .set_name("conv1/7x7_s2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1/relu_7x7")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool1/3x3_s2")
+ << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f))
+ .set_name("pool1/norm1")
<< ConvolutionLayer(
- 7U, 7U, 64U,
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv1/conv1_7x7_s2_b.npy"),
- PadStrideInfo(2, 2, 3, 3))
- .set_name("conv1/7x7_s2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/relu_7x7")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool1/3x3_s2")
- << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("pool1/norm1")
- << ConvolutionLayer(
- 1U, 1U, 64U,
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2/3x3_reduce")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2/relu_3x3_reduce")
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_reduce_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2/3x3_reduce")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2/relu_3x3_reduce")
<< ConvolutionLayer(
- 3U, 3U, 192U,
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2/3x3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2/relu_3x3")
- << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)).set_name("conv2/norm2")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool2/3x3_s2");
- graph << get_inception_node(data_path, "inception_3a", weights_layout, 64, std::make_tuple(96U, 128U), std::make_tuple(16U, 32U), 32U).set_name("inception_3a/concat");
- graph << get_inception_node(data_path, "inception_3b", weights_layout, 128, std::make_tuple(128U, 192U), std::make_tuple(32U, 96U), 64U).set_name("inception_3b/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool3/3x3_s2");
- graph << get_inception_node(data_path, "inception_4a", weights_layout, 192, std::make_tuple(96U, 208U), std::make_tuple(16U, 48U), 64U).set_name("inception_4a/concat");
- graph << get_inception_node(data_path, "inception_4b", weights_layout, 160, std::make_tuple(112U, 224U), std::make_tuple(24U, 64U), 64U).set_name("inception_4b/concat");
- graph << get_inception_node(data_path, "inception_4c", weights_layout, 128, std::make_tuple(128U, 256U), std::make_tuple(24U, 64U), 64U).set_name("inception_4c/concat");
- graph << get_inception_node(data_path, "inception_4d", weights_layout, 112, std::make_tuple(144U, 288U), std::make_tuple(32U, 64U), 64U).set_name("inception_4d/concat");
- graph << get_inception_node(data_path, "inception_4e", weights_layout, 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U).set_name("inception_4e/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool4/3x3_s2");
- graph << get_inception_node(data_path, "inception_5a", weights_layout, 256, std::make_tuple(160U, 320U), std::make_tuple(32U, 128U), 128U).set_name("inception_5a/concat");
- graph << get_inception_node(data_path, "inception_5b", weights_layout, 384, std::make_tuple(192U, 384U), std::make_tuple(48U, 128U), 128U).set_name("inception_5b/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 7, operation_layout, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))).set_name("pool5/7x7_s1")
+ 3U, 3U, 192U,
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/conv2/conv2_3x3_b.npy"),
+ PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2/3x3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2/relu_3x3")
+ << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f))
+ .set_name("conv2/norm2")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool2/3x3_s2");
+ graph << get_inception_node(data_path, "inception_3a", weights_layout, 64, std::make_tuple(96U, 128U),
+ std::make_tuple(16U, 32U), 32U)
+ .set_name("inception_3a/concat");
+ graph << get_inception_node(data_path, "inception_3b", weights_layout, 128, std::make_tuple(128U, 192U),
+ std::make_tuple(32U, 96U), 64U)
+ .set_name("inception_3b/concat");
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool3/3x3_s2");
+ graph << get_inception_node(data_path, "inception_4a", weights_layout, 192, std::make_tuple(96U, 208U),
+ std::make_tuple(16U, 48U), 64U)
+ .set_name("inception_4a/concat");
+ graph << get_inception_node(data_path, "inception_4b", weights_layout, 160, std::make_tuple(112U, 224U),
+ std::make_tuple(24U, 64U), 64U)
+ .set_name("inception_4b/concat");
+ graph << get_inception_node(data_path, "inception_4c", weights_layout, 128, std::make_tuple(128U, 256U),
+ std::make_tuple(24U, 64U), 64U)
+ .set_name("inception_4c/concat");
+ graph << get_inception_node(data_path, "inception_4d", weights_layout, 112, std::make_tuple(144U, 288U),
+ std::make_tuple(32U, 64U), 64U)
+ .set_name("inception_4d/concat");
+ graph << get_inception_node(data_path, "inception_4e", weights_layout, 256, std::make_tuple(160U, 320U),
+ std::make_tuple(32U, 128U), 128U)
+ .set_name("inception_4e/concat");
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool4/3x3_s2");
+ graph << get_inception_node(data_path, "inception_5a", weights_layout, 256, std::make_tuple(160U, 320U),
+ std::make_tuple(32U, 128U), 128U)
+ .set_name("inception_5a/concat");
+ graph << get_inception_node(data_path, "inception_5b", weights_layout, 384, std::make_tuple(192U, 384U),
+ std::make_tuple(48U, 128U), 128U)
+ .set_name("inception_5b/concat");
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 7, operation_layout,
+ PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool5/7x7_s1")
<< FullyConnectedLayer(
- 1000U,
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_b.npy"))
- .set_name("loss3/classifier")
- << SoftmaxLayer().set_name("prob")
- << OutputLayer(get_output_accessor(common_params, 5));
+ 1000U,
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/googlenet_model/loss3/loss3_classifier_b.npy"))
+ .set_name("loss3/classifier")
+ << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -148,63 +184,63 @@ private:
CommonGraphParams common_params;
Stream graph;
- ConcatLayer get_inception_node(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int a_filt,
+ ConcatLayer get_inception_node(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int a_filt,
std::tuple<unsigned int, unsigned int> b_filters,
std::tuple<unsigned int, unsigned int> c_filters,
- unsigned int d_filt)
+ unsigned int d_filt)
{
std::string total_path = "/cnn_data/googlenet_model/" + param_path + "/" + param_path + "_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(
- 1U, 1U, a_filt,
- get_weights_accessor(data_path, total_path + "1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_1x1");
+ i_a << ConvolutionLayer(1U, 1U, a_filt,
+ get_weights_accessor(data_path, total_path + "1x1_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "1x1_b.npy"), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_1x1");
SubStream i_b(graph);
- i_b << ConvolutionLayer(
- 1U, 1U, std::get<0>(b_filters),
- get_weights_accessor(data_path, total_path + "3x3_reduce_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "3x3_reduce_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/3x3_reduce")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_3x3_reduce")
- << ConvolutionLayer(
- 3U, 3U, std::get<1>(b_filters),
- get_weights_accessor(data_path, total_path + "3x3_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "3x3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/3x3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_3x3");
+ i_b << ConvolutionLayer(1U, 1U, std::get<0>(b_filters),
+ get_weights_accessor(data_path, total_path + "3x3_reduce_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "3x3_reduce_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/3x3_reduce")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_3x3_reduce")
+ << ConvolutionLayer(3U, 3U, std::get<1>(b_filters),
+ get_weights_accessor(data_path, total_path + "3x3_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "3x3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/3x3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_3x3");
SubStream i_c(graph);
- i_c << ConvolutionLayer(
- 1U, 1U, std::get<0>(c_filters),
- get_weights_accessor(data_path, total_path + "5x5_reduce_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "5x5_reduce_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/5x5_reduce")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_5x5_reduce")
- << ConvolutionLayer(
- 5U, 5U, std::get<1>(c_filters),
- get_weights_accessor(data_path, total_path + "5x5_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "5x5_b.npy"),
- PadStrideInfo(1, 1, 2, 2))
- .set_name(param_path + "/5x5")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_5x5");
+ i_c << ConvolutionLayer(1U, 1U, std::get<0>(c_filters),
+ get_weights_accessor(data_path, total_path + "5x5_reduce_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "5x5_reduce_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/5x5_reduce")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_5x5_reduce")
+ << ConvolutionLayer(5U, 5U, std::get<1>(c_filters),
+ get_weights_accessor(data_path, total_path + "5x5_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "5x5_b.npy"), PadStrideInfo(1, 1, 2, 2))
+ .set_name(param_path + "/5x5")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_5x5");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL))).set_name(param_path + "/pool")
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)))
+ .set_name(param_path + "/pool")
<< ConvolutionLayer(
- 1U, 1U, d_filt,
- get_weights_accessor(data_path, total_path + "pool_proj_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "pool_proj_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/pool_proj")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_pool_proj");
+ 1U, 1U, d_filt, get_weights_accessor(data_path, total_path + "pool_proj_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "pool_proj_b.npy"), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/pool_proj")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_pool_proj");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
diff --git a/examples/graph_inception_resnet_v1.cpp b/examples/graph_inception_resnet_v1.cpp
index d789d7f6e7..a54a0f7806 100644
--- a/examples/graph_inception_resnet_v1.cpp
+++ b/examples/graph_inception_resnet_v1.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -38,7 +39,12 @@ class InceptionResNetV1Example final : public Example
{
public:
InceptionResNetV1Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), model_input_width(nullptr), model_input_height(nullptr), graph(0, "InceptionResNetV1")
+ : cmd_parser(),
+ common_opts(cmd_parser),
+ common_params(),
+ model_input_width(nullptr),
+ model_input_height(nullptr),
+ graph(0, "InceptionResNetV1")
{
model_input_width = cmd_parser.add_option<SimpleOption<unsigned int>>("image-width", 512);
model_input_height = cmd_parser.add_option<SimpleOption<unsigned int>>("image-height", 512);
@@ -47,7 +53,7 @@ public:
model_input_width->set_help("Input image width.");
model_input_height->set_help("Input image height.");
}
- InceptionResNetV1Example(const InceptionResNetV1Example &) = delete;
+ InceptionResNetV1Example(const InceptionResNetV1Example &) = delete;
InceptionResNetV1Example &operator=(const InceptionResNetV1Example &) = delete;
~InceptionResNetV1Example() override = default;
bool do_setup(int argc, char **argv) override
@@ -60,7 +66,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -70,13 +76,14 @@ public:
const unsigned int image_height = model_input_height->value();
// Set default layout if needed
- if(!common_opts.data_layout->is_set() && common_params.target == Target::NEON)
+ if (!common_opts.data_layout->is_set() && common_params.target == Target::NEON)
{
common_params.data_layout = DataLayout::NCHW;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -86,7 +93,7 @@ public:
// Create model path
std::string data_path = common_params.data_path;
std::string model_path = "/cnn_data/inception_resnet_v1_model/";
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
@@ -96,95 +103,98 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape = permute_shape(
+ TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
// Conv2d_1a_3x3
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Conv2d_1a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Conv2d_1a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_beta.npy"),
batch_norm_epsilon)
- .set_name("Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu")
+ .set_name("Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_1a_3x3/Relu")
// Conv2d_2a_3x3
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_2a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_2a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_beta.npy"),
batch_norm_epsilon)
- .set_name("Conv2d_2a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu")
+ .set_name("Conv2d_2a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2a_3x3/Relu")
// Conv2d_2b_3x3
- << ConvolutionLayer(3U, 3U, 64U,
- get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Conv2d_2b_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 64U, get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Conv2d_2b_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_beta.npy"),
batch_norm_epsilon)
- .set_name("Conv2d_2b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu")
+ .set_name("Conv2d_2b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2b_3x3/Relu")
// MaxPool_3a_3x3
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)).set_name("MaxPool_3a_3x3/MaxPool")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("MaxPool_3a_3x3/MaxPool")
// Conv2d_3b_1x1
- << ConvolutionLayer(1U, 1U, 80U,
- get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_3b_1x1/convolution")
+ << ConvolutionLayer(
+ 1U, 1U, 80U, get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_3b_1x1/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_beta.npy"),
batch_norm_epsilon)
- .set_name("Conv2d_3b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu")
+ .set_name("Conv2d_3b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_3b_1x1/Relu")
// Conv2d_4a_3x3
- << ConvolutionLayer(3U, 3U, 192U,
- get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_4a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 192U, get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_4a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_beta.npy"),
batch_norm_epsilon)
- .set_name("Conv2d_4a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4a_3x3/Relu")
+ .set_name("Conv2d_4a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_4a_3x3/Relu")
// Conv2d_4b_3x3
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, "Conv2d_4b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Conv2d_4a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "Conv2d_4b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Conv2d_4a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_4b_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_4b_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_4b_3x3_BatchNorm_beta.npy"),
batch_norm_epsilon)
- .set_name("Conv2d_4b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4b_3x3/Relu");
+ .set_name("Conv2d_4b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_4b_3x3/Relu");
// 5 x Inception-resnet-A
block35_repeat(data_path, weights_layout, 5);
@@ -202,11 +212,9 @@ public:
// Logits tail
graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a_8x8")
<< FlattenLayer().set_name("Logits/Flatten")
- << FullyConnectedLayer(
- 128U,
- get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout),
- get_weights_accessor(data_path, "Logits_Logits_biases.npy"))
- .set_name("Logits/Logits")
+ << FullyConnectedLayer(128U, get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "Logits_Logits_biases.npy"))
+ .set_name("Logits/Logits")
<< OutputLayer(std::make_unique<DummyAccessor>(0));
// Finalize graph
@@ -231,14 +239,14 @@ private:
CommandLineParser cmd_parser;
CommonGraphOptions common_opts;
CommonGraphParams common_params;
- SimpleOption<unsigned int> *model_input_width{ nullptr };
- SimpleOption<unsigned int> *model_input_height{ nullptr };
+ SimpleOption<unsigned int> *model_input_width{nullptr};
+ SimpleOption<unsigned int> *model_input_height{nullptr};
Stream graph;
private:
void block35_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks)
{
- for(unsigned int i = 0; i < num_blocks; ++i)
+ for (unsigned int i = 0; i < num_blocks; ++i)
{
std::stringstream unit_path_ss;
unit_path_ss << "Repeat_block35_" << (i + 1) << "_";
@@ -254,102 +262,128 @@ private:
// Branch 0
SubStream i_la(i_l);
- i_la << ConvolutionLayer(1U, 1U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
+ i_la << ConvolutionLayer(
+ 1U, 1U, 32U,
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_lb(i_l);
i_lb << ConvolutionLayer(1U, 1U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu");
+ .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu");
// Branch 2
SubStream i_lc(i_l);
i_lc << ConvolutionLayer(1U, 1U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu")
+ .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu")
<< ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu");
+ .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu");
// Concatenate
i_l << ConcatLayer(std::move(i_la), std::move(i_lb), std::move(i_lc)).set_name(unit_name + "concat")
- << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Conv2d_1x1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f)).set_name(unit_name + "mul");
+ << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Conv2d_1x1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f))
+ .set_name(unit_name + "mul");
graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
void block17_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks)
{
- for(unsigned int i = 0; i < num_blocks; ++i)
+ for (unsigned int i = 0; i < num_blocks; ++i)
{
std::stringstream unit_path_ss;
unit_path_ss << "Repeat_1_block17_" << (i + 1) << "_";
@@ -365,79 +399,101 @@ private:
// Branch 0
SubStream i_la(i_l);
- i_la << ConvolutionLayer(1U, 1U, 128U,
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
+ i_la << ConvolutionLayer(
+ 1U, 1U, 128U,
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_lb(i_l);
i_lb << ConvolutionLayer(1U, 1U, 128U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(7U, 1U, 128U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 3, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu")
<< ConvolutionLayer(1U, 7U, 128U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 3))
- .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu");
+ .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu");
// Concatenate
i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat")
- << ConvolutionLayer(1U, 1U, 896U,
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Conv2d_1x1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f)).set_name(unit_name + "mul");
+ << ConvolutionLayer(
+ 1U, 1U, 896U,
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Conv2d_1x1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f))
+ .set_name(unit_name + "mul");
graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
- void block8_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks, float scale, bool has_activation)
+ void block8_repeat(const std::string &data_path,
+ DataLayout weights_layout,
+ unsigned int num_blocks,
+ float scale,
+ bool has_activation)
{
- for(unsigned int i = 0; i < num_blocks; ++i)
+ for (unsigned int i = 0; i < num_blocks; ++i)
{
std::stringstream unit_path_ss;
std::stringstream unit_name_ss;
- if(num_blocks != 1)
+ if (num_blocks != 1)
{
unit_path_ss << "Repeat_2_block8_" << (i + 1) << "_";
unit_name_ss << "Repeat_2/block8_" << (i + 1) << "/";
@@ -457,79 +513,97 @@ private:
// Branch 0
SubStream i_la(i_l);
- i_la << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
+ i_la << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_lb(i_l);
i_lb << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(3U, 1U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu")
<< ConvolutionLayer(1U, 3U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 1))
- .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu");
+ .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu");
// Concatenate
i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat")
- << ConvolutionLayer(1U, 1U, 1792U,
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Conv2d_1x1/convolution");
+ << ConvolutionLayer(
+ 1U, 1U, 1792U,
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Conv2d_1x1/convolution");
// Scale result
- if(scale != 1.f)
+ if (scale != 1.f)
{
- i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f)).set_name(unit_name + "mul");
+ i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f))
+ .set_name(unit_name + "mul");
}
// Residual add
graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add");
// Apply activation if needed
- if(has_activation)
+ if (has_activation)
{
- graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
}
@@ -538,61 +612,71 @@ private:
{
// Branch 0
SubStream i_a(graph);
- i_a << ConvolutionLayer(3U, 3U, 384U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 3U, 3U, 384U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu");
// Branch 1
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 192U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 192U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu");
// Branch 2
SubStream i_c(graph);
- i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3");
+ i_c << PoolingLayer(
+ PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true))
+ .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3");
// Concatenate
graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)).set_name("Mixed_6a/concat");
@@ -602,103 +686,120 @@ private:
{
// Branch 0
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 384U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 384U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu");
// Branch 1
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu");
// Branch 2
SubStream i_c(graph);
- i_c << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- batch_norm_epsilon)
- .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu");
+ i_c << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ batch_norm_epsilon)
+ .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu");
// Branch 3
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3");
+ i_d << PoolingLayer(
+ PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true))
+ .set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3");
// Concatenate
- graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat");
+ graph
+ << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat");
}
};
diff --git a/examples/graph_inception_resnet_v2.cpp b/examples/graph_inception_resnet_v2.cpp
index 1d0c51e9ad..43e31ee14b 100644
--- a/examples/graph_inception_resnet_v2.cpp
+++ b/examples/graph_inception_resnet_v2.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class InceptionResNetV2Example final : public Example
{
public:
- InceptionResNetV2Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionResNetV2")
+ InceptionResNetV2Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionResNetV2")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,20 +49,21 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Set default layout if needed
- if(!common_opts.data_layout->is_set() && common_params.target == Target::NEON)
+ if (!common_opts.data_layout->is_set() && common_params.target == Target::NEON)
{
common_params.data_layout = DataLayout::NCHW;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -70,7 +71,7 @@ public:
// Create model path
std::string data_path = common_params.data_path;
std::string model_path = "/cnn_data/inception_resnet_v2_model/";
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
@@ -80,84 +81,88 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
// Conv2d_1a_3x3
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Conv2d_1a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Conv2d_1a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_1a_3x3_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu")
+ .set_name("Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_1a_3x3/Relu")
// Conv2d_2a_3x3
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_2a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_2a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_2a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_2a_3x3_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv2d_2a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu")
+ .set_name("Conv2d_2a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2a_3x3/Relu")
// Conv2d_2b_3x3
- << ConvolutionLayer(3U, 3U, 64U,
- get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Conv2d_2b_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 64U, get_weights_accessor(data_path, "Conv2d_2b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Conv2d_2b_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_2b_3x3_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv2d_2b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu")
+ .set_name("Conv2d_2b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2b_3x3/Relu")
// MaxPool_3a_3x3
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)).set_name("MaxPool_3a_3x3/MaxPool")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("MaxPool_3a_3x3/MaxPool")
// Conv2d_3b_1x1
- << ConvolutionLayer(1U, 1U, 80U,
- get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_3b_1x1/convolution")
+ << ConvolutionLayer(
+ 1U, 1U, 80U, get_weights_accessor(data_path, "Conv2d_3b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_3b_1x1/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_3b_1x1_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv2d_3b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu")
+ .set_name("Conv2d_3b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_3b_1x1/Relu")
// Conv2d_4a_3x3
- << ConvolutionLayer(3U, 3U, 192U,
- get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_4a_3x3/convolution")
+ << ConvolutionLayer(
+ 3U, 3U, 192U, get_weights_accessor(data_path, "Conv2d_4a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_4a_3x3/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_4a_3x3_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv2d_4a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4a_3x3/Relu")
+ .set_name("Conv2d_4a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_4a_3x3/Relu")
// MaxPool_5a_3x3
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("MaxPool_5a_3x3/MaxPool");
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0), true))
+ .set_name("MaxPool_5a_3x3/MaxPool");
block_mixed_5b(data_path, weights_layout);
block35_repeat(data_path, weights_layout, 10);
@@ -168,27 +173,25 @@ public:
block8_repeat(data_path, weights_layout, 1, 1.f, false);
// Conv2d_7b_1x1
- graph << ConvolutionLayer(1U, 1U, 1536U,
- get_weights_accessor(data_path, "Conv2d_7b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_7b_1x1/convolution")
+ graph << ConvolutionLayer(
+ 1U, 1U, 1536U, get_weights_accessor(data_path, "Conv2d_7b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_7b_1x1/convolution")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_7b_1x1_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv2d_7b_1x1_BatchNorm_moving_variance.npy"),
get_random_accessor(1.f, 1.f),
get_weights_accessor(data_path, "Conv2d_7b_1x1_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv2d_7b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_7b_1x1/Relu")
+ .set_name("Conv2d_7b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_7b_1x1/Relu")
<< PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a_8x8")
<< FlattenLayer().set_name("Logits/Flatten")
- << FullyConnectedLayer(
- 1001U,
- get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout),
- get_weights_accessor(data_path, "Logits_Logits_biases.npy"))
- .set_name("Logits/Logits")
- << SoftmaxLayer().set_name("Logits/Predictions")
- << OutputLayer(get_output_accessor(common_params, 5));
+ << FullyConnectedLayer(1001U,
+ get_weights_accessor(data_path, "Logits_Logits_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "Logits_Logits_biases.npy"))
+ .set_name("Logits/Logits")
+ << SoftmaxLayer().set_name("Logits/Predictions") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -219,164 +222,191 @@ private:
{
// Branch 0
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 96U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_5b/Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_0/Conv2d_1x1/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 96U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_5b/Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 48U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(5U, 5U, 64U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 2, 2))
- .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 48U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 5U, 5U, 64U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 2, 2))
+ .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_1_Conv2d_0b_5x5_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_1/Conv2d_0b_5x5/Relu");
// Branch 2
SubStream i_c(graph);
- i_c << ConvolutionLayer(1U, 1U, 64U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu");
+ i_c << ConvolutionLayer(
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu");
// Branch 3
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true)).set_name("Mixed_5b/Branch_3/AvgPool_0a_3x3")
- << ConvolutionLayer(1U, 1U, 64U,
- get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu");
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name("Mixed_5b/Branch_3/AvgPool_0a_3x3")
+ << ConvolutionLayer(
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_5b_Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu");
// Concatenate
- graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_5a/concat");
+ graph
+ << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_5a/concat");
}
void block_mixed_6a(const std::string &data_path, DataLayout weights_layout)
{
// Branch 0
SubStream i_a(graph);
- i_a << ConvolutionLayer(3U, 3U, 384U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 3U, 3U, 384U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu");
// Branch 1
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 384U,
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 384U,
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_6a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu");
// Branch 2
SubStream i_c(graph);
- i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true)).set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3");
+ i_c << PoolingLayer(
+ PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0), true))
+ .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3");
// Concatenate
graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c)).set_name("Mixed_6a/concat");
@@ -386,108 +416,125 @@ private:
{
// Branch 0
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 384U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 384U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu");
// Branch 1
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 288U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 288U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu");
// Branch 2
SubStream i_c(graph);
- i_c << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 288U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 320U,
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu");
+ i_c << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 288U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_2/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 320U,
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "Mixed_7a_Branch_2_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_2/Conv2d_1a_3x3/Relu");
// Branch 3
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true)).set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3");
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("Mixed_7a/Branch_3/MaxPool_1a_3x3");
// Concatenate
- graph << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat");
+ graph
+ << ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d)).set_name("Mixed_7a/concat");
}
void block35_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks)
{
- for(unsigned int i = 0; i < num_blocks; ++i)
+ for (unsigned int i = 0; i < num_blocks; ++i)
{
std::stringstream unit_path_ss;
unit_path_ss << "Repeat_block35_" << (i + 1) << "_";
@@ -503,102 +550,128 @@ private:
// Branch 0
SubStream i_la(i_l);
- i_la << ConvolutionLayer(1U, 1U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
+ i_la << ConvolutionLayer(
+ 1U, 1U, 32U,
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_lb(i_l);
i_lb << ConvolutionLayer(1U, 1U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu");
+ .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0b_3x3/Relu");
// Branch 2
SubStream i_lc(i_l);
i_lc << ConvolutionLayer(1U, 1U, 32U,
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_2/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(3U, 3U, 48U,
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu")
+ .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_2/Conv2d_0b_3x3/Relu")
<< ConvolutionLayer(3U, 3U, 64U,
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu");
+ .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_2/Conv2d_0c_3x3/Relu");
// Concatenate
i_l << ConcatLayer(std::move(i_la), std::move(i_lb), std::move(i_lc)).set_name(unit_name + "concat")
- << ConvolutionLayer(1U, 1U, 320U,
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Conv2d_1x1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f)).set_name(unit_name + "mul");
+ << ConvolutionLayer(
+ 1U, 1U, 320U,
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Conv2d_1x1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.17f, 0.f))
+ .set_name(unit_name + "mul");
graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
void block17_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks)
{
- for(unsigned int i = 0; i < num_blocks; ++i)
+ for (unsigned int i = 0; i < num_blocks; ++i)
{
std::stringstream unit_path_ss;
unit_path_ss << "Repeat_1_block17_" << (i + 1) << "_";
@@ -614,79 +687,101 @@ private:
// Branch 0
SubStream i_la(i_l);
- i_la << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
+ i_la << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_lb(i_l);
i_lb << ConvolutionLayer(1U, 1U, 128U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(7U, 1U, 160U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 3, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x7/Relu")
<< ConvolutionLayer(1U, 7U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 3))
- .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu");
+ .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0c_7x1/Relu");
// Concatenate
i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat")
- << ConvolutionLayer(1U, 1U, 1088U,
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Conv2d_1x1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f)).set_name(unit_name + "mul");
+ << ConvolutionLayer(
+ 1U, 1U, 1088U,
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Conv2d_1x1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.10f, 0.f))
+ .set_name(unit_name + "mul");
graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
- void block8_repeat(const std::string &data_path, DataLayout weights_layout, unsigned int num_blocks, float scale, bool has_activation)
+ void block8_repeat(const std::string &data_path,
+ DataLayout weights_layout,
+ unsigned int num_blocks,
+ float scale,
+ bool has_activation)
{
- for(unsigned int i = 0; i < num_blocks; ++i)
+ for (unsigned int i = 0; i < num_blocks; ++i)
{
std::stringstream unit_path_ss;
std::stringstream unit_name_ss;
- if(num_blocks != 1)
+ if (num_blocks != 1)
{
unit_path_ss << "Repeat_2_block8_" << (i + 1) << "_";
unit_name_ss << "Repeat_2/block8_" << (i + 1) << "/";
@@ -706,79 +801,97 @@ private:
// Branch 0
SubStream i_la(i_l);
- i_la << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
+ i_la << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_0_Conv2d_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_0_Conv2d_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_0/Conv2d_1x1/Relu");
// Branch 1
SubStream i_lb(i_l);
i_lb << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(3U, 1U, 224U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 1, 0))
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu")
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0b_1x3/Relu")
<< ConvolutionLayer(1U, 3U, 256U,
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_weights.npy",
+ weights_layout),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(1, 1, 0, 1))
- .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu");
+ .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, unit_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Branch_1/Conv2d_0c_3x1/Relu");
// Concatenate
i_l << ConcatLayer(std::move(i_la), std::move(i_lb)).set_name(unit_name + "concat")
- << ConvolutionLayer(1U, 1U, 2080U,
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "Conv2d_1x1/convolution");
+ << ConvolutionLayer(
+ 1U, 1U, 2080U,
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "Conv2d_1x1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "Conv2d_1x1/convolution");
// Scale result
- if(scale != 1.f)
+ if (scale != 1.f)
{
- i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f)).set_name(unit_name + "mul");
+ i_l << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, scale, 0.f))
+ .set_name(unit_name + "mul");
}
// Residual add
graph << EltwiseLayer(std::move(i_l), std::move(i_r), EltwiseOperation::Add).set_name(unit_name + "add");
// Apply activation if needed
- if(has_activation)
+ if (has_activation)
{
- graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
}
diff --git a/examples/graph_inception_v3.cpp b/examples/graph_inception_v3.cpp
index 160e7f04f4..75e03fb6b3 100644
--- a/examples/graph_inception_v3.cpp
+++ b/examples/graph_inception_v3.cpp
@@ -21,9 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -37,8 +38,7 @@ using namespace arm_compute::graph_utils;
class InceptionV3Example : public Example
{
public:
- InceptionV3Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV3")
+ InceptionV3Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV3")
{
}
bool do_setup(int argc, char **argv) override
@@ -51,7 +51,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -68,134 +68,163 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- nullptr, get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_1a_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_2a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"),
- nullptr, get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_2a_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu")
-
- << ConvolutionLayer(3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
- .set_name("Conv2d_2b_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"),
- nullptr, get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_2b_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu")
-
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("MaxPool_3a_3x3/MaxPool")
-
- << ConvolutionLayer(1U, 1U, 80U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_3b_1x1/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_variance.npy"),
- nullptr, get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_3b_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_3b_1x1/Relu")
-
- << ConvolutionLayer(3U, 3U, 192U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_4a_3x3/convolution")
- << BatchNormalizationLayer(get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_variance.npy"),
- nullptr, get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_4a_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_4a_3x3/Relu")
-
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("MaxPool_5a_3x3/MaxPool");
-
- graph << get_inception_node_A(data_path, "Mixed_5b", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U),
- 32U)
- .set_name("Mixed_5b/concat");
- graph << get_inception_node_A(data_path, "Mixed_5c", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U),
- 64U, true)
- .set_name("Mixed_5c/concat");
- graph << get_inception_node_A(data_path, "Mixed_5d", weights_layout, 64U, std::make_tuple(48U, 64U), std::make_tuple(64U, 96U, 96U),
- 64U)
- .set_name("Mixed_5d/concat");
-
- graph << get_inception_node_B(data_path, "Mixed_6a", weights_layout, 384U, std::make_tuple(64U, 96U, 96U)).set_name("Mixed_6a/concat");
+ graph
+ << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
+ << ConvolutionLayer(3U, 3U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(2, 2, 0, 0))
+ .set_name("Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_1a_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_1a_3x3/Relu")
+ << ConvolutionLayer(3U, 3U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_2a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_2a_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2a_3x3/Relu")
+
+ << ConvolutionLayer(3U, 3U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 1, 1))
+ .set_name("Conv2d_2b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_2b_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_2b_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2b_3x3/Relu")
+
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("MaxPool_3a_3x3/MaxPool")
+
+ << ConvolutionLayer(1U, 1U, 80U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_3b_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_3b_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_3b_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_3b_1x1/Relu")
+
+ << ConvolutionLayer(3U, 3U, 192U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_4a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Conv2d_4a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_4a_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_4a_3x3/Relu")
+
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("MaxPool_5a_3x3/MaxPool");
+
+ graph << get_inception_node_A(data_path, "Mixed_5b", weights_layout, 64U, std::make_tuple(48U, 64U),
+ std::make_tuple(64U, 96U, 96U), 32U)
+ .set_name("Mixed_5b/concat");
+ graph << get_inception_node_A(data_path, "Mixed_5c", weights_layout, 64U, std::make_tuple(48U, 64U),
+ std::make_tuple(64U, 96U, 96U), 64U, true)
+ .set_name("Mixed_5c/concat");
+ graph << get_inception_node_A(data_path, "Mixed_5d", weights_layout, 64U, std::make_tuple(48U, 64U),
+ std::make_tuple(64U, 96U, 96U), 64U)
+ .set_name("Mixed_5d/concat");
+
+ graph << get_inception_node_B(data_path, "Mixed_6a", weights_layout, 384U, std::make_tuple(64U, 96U, 96U))
+ .set_name("Mixed_6a/concat");
graph << get_inception_node_C(data_path, "Mixed_6b", weights_layout, 192U, std::make_tuple(128U, 128U, 192U),
std::make_tuple(128U, 128U, 128U, 128U, 192U), 192U)
- .set_name("Mixed_6b/concat");
+ .set_name("Mixed_6b/concat");
graph << get_inception_node_C(data_path, "Mixed_6c", weights_layout, 192U, std::make_tuple(160U, 160U, 192U),
std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U)
- .set_name("Mixed_6c/concat");
+ .set_name("Mixed_6c/concat");
graph << get_inception_node_C(data_path, "Mixed_6d", weights_layout, 192U, std::make_tuple(160U, 160U, 192U),
std::make_tuple(160U, 160U, 160U, 160U, 192U), 192U)
- .set_name("Mixed_6d/concat");
+ .set_name("Mixed_6d/concat");
graph << get_inception_node_C(data_path, "Mixed_6e", weights_layout, 192U, std::make_tuple(192U, 192U, 192U),
std::make_tuple(192U, 192U, 192U, 192U, 192U), 192U)
- .set_name("Mixed_6e/concat");
+ .set_name("Mixed_6e/concat");
graph << get_inception_node_D(data_path, "Mixed_7a", weights_layout, std::make_tuple(192U, 320U),
std::make_tuple(192U, 192U, 192U, 192U))
- .set_name("Mixed_7a/concat");
+ .set_name("Mixed_7a/concat");
graph << get_inception_node_E(data_path, "Mixed_7b", weights_layout, 320U, std::make_tuple(384U, 384U, 384U),
std::make_tuple(448U, 384U, 384U, 384U), 192U)
- .set_name("Mixed_7b/concat");
+ .set_name("Mixed_7b/concat");
graph << get_inception_node_E(data_path, "Mixed_7c", weights_layout, 320U, std::make_tuple(384U, 384U, 384U),
std::make_tuple(448U, 384U, 384U, 384U), 192U, true)
- .set_name("Mixed_7c/concat");
-
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 8, operation_layout, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL))).set_name("Logits/AvgPool_1a_8x8/AvgPool")
- << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy", weights_layout),
- get_weights_accessor(data_path,
- "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_biases.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Logits/Conv2d_1c_1x1/convolution")
+ .set_name("Mixed_7c/concat");
+
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 8, operation_layout,
+ PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("Logits/AvgPool_1a_8x8/AvgPool")
+ << ConvolutionLayer(
+ 1U, 1U, 1001U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_weights.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/inceptionv3_model/Logits_Conv2d_1c_1x1_biases.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("Logits/Conv2d_1c_1x1/convolution")
<< ReshapeLayer(TensorShape(1001U)).set_name("Predictions/Reshape")
- << SoftmaxLayer().set_name("Predictions/Softmax")
- << OutputLayer(get_output_accessor(common_params, 5));
+ << SoftmaxLayer().set_name("Predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -223,19 +252,21 @@ private:
Stream graph;
private:
- ConcatLayer get_inception_node_A(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int a_filt,
- std::tuple<unsigned int, unsigned int> b_filters,
+ ConcatLayer get_inception_node_A(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int a_filt,
+ std::tuple<unsigned int, unsigned int> b_filters,
std::tuple<unsigned int, unsigned int, unsigned int> c_filters,
- unsigned int d_filt,
- bool is_name_different = false)
+ unsigned int d_filt,
+ bool is_name_different = false)
{
std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_";
// This is due to a naming issue in the tf model
std::string conv_id0 = "_0a_";
std::string conv_id1 = "2d_0b_";
- if(is_name_different)
+ if (is_name_different)
{
conv_id0 = "_0b_";
conv_id1 = "_1_0c_";
@@ -243,457 +274,451 @@ private:
SubStream i_a(graph);
i_a << ConvolutionLayer(
- 1U, 1U, a_filt,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
+ 1U, 1U, a_filt,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
SubStream i_b(graph);
i_b << ConvolutionLayer(
- 1U, 1U, std::get<0>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/Relu")
+ 1U, 1U, std::get<0>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 + "1x1_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/convolution")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 +
+ "1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 +
+ "1x1_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id0 +
+ "1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id0 + "1x1/Relu")
<< ConvolutionLayer(
- 5U, 5U, std::get<1>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 2, 2))
- .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/Relu");
+ 5U, 5U, std::get<1>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 2, 2))
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv" + conv_id1 + "5x5_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id1 + "5x5/Relu");
SubStream i_c(graph);
i_c << ConvolutionLayer(
- 1U, 1U, std::get<0>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<1>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu")
+ 3U, 3U, std::get<1>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<2>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm/batcnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu");
+ 3U, 3U, std::get<2>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+                  .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL),
- true))
- .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
<< ConvolutionLayer(
- 1U, 1U, d_filt,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
+ 1U, 1U, d_filt,
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
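// Editor's note (illustrative sketch only, not part of this patch): every hunk in this file
// rewraps the same Conv2d -> BatchNorm -> ReLU triplet to the revised clang-format line width.
// The hypothetical helper below is built solely from identifiers already used in this example
// (SubStream, ConvolutionLayer, BatchNormalizationLayer, ActivationLayer, ActivationLayerInfo,
// PadStrideInfo, get_weights_accessor); the helper name and its parameter list are assumptions
// introduced here for illustration and are not part of the library or of this commit.
void append_conv_bn_relu_sketch(SubStream         &stream,
                                const std::string &data_path,
                                const std::string &total_path,
                                const std::string &param_path,
                                const std::string &unit,       // e.g. "Branch_0/Conv2d_0a_1x1"
                                const std::string &weights_id, // e.g. "Branch_0_Conv2d_0a_1x1"
                                unsigned int       width,
                                unsigned int       height,
                                unsigned int       ofm,
                                DataLayout         weights_layout,
                                PadStrideInfo      pad_stride_info)
{
    // Convolution -> BatchNorm (epsilon 0.001f, beta only, no gamma) -> ReLU,
    // mirroring the pattern that the surrounding hunks reformat.
    stream << ConvolutionLayer(width, height, ofm,
                               get_weights_accessor(data_path, total_path + weights_id + "_weights.npy",
                                                    weights_layout),
                               std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), pad_stride_info)
                  .set_name(param_path + "/" + unit + "/convolution")
           << BatchNormalizationLayer(
                  get_weights_accessor(data_path, total_path + weights_id + "_BatchNorm_moving_mean.npy"),
                  get_weights_accessor(data_path, total_path + weights_id + "_BatchNorm_moving_variance.npy"),
                  nullptr, get_weights_accessor(data_path, total_path + weights_id + "_BatchNorm_beta.npy"),
                  0.001f)
                  .set_name(param_path + "/" + unit + "/BatchNorm/batchnorm")
           << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
                  .set_name(param_path + "/" + unit + "/Relu");
}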
- ConcatLayer get_inception_node_B(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int a_filt,
+ ConcatLayer get_inception_node_B(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int a_filt,
std::tuple<unsigned int, unsigned int, unsigned int> b_filters)
{
std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_";
SubStream i_a(graph);
i_a << ConvolutionLayer(
- 3U, 3U, a_filt,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_1a_1x1/Relu");
+ 3U, 3U, a_filt,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_1a_1x1/Relu");
SubStream i_b(graph);
i_b << ConvolutionLayer(
- 1U, 1U, std::get<0>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<1>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu")
+ 3U, 3U, std::get<1>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<2>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_1a_1x1/Relu");
+ 3U, 3U, std::get<2>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_1a_1x1/Relu");
SubStream i_c(graph);
- i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool");
+ i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c));
}
- ConcatLayer get_inception_node_C(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int a_filt,
- std::tuple<unsigned int, unsigned int, unsigned int> b_filters,
- std::tuple<unsigned int, unsigned int, unsigned int, unsigned int, unsigned int> c_filters,
- unsigned int d_filt)
+ ConcatLayer
+ get_inception_node_C(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int a_filt,
+ std::tuple<unsigned int, unsigned int, unsigned int> b_filters,
+ std::tuple<unsigned int, unsigned int, unsigned int, unsigned int, unsigned int> c_filters,
+ unsigned int d_filt)
{
std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_";
SubStream i_a(graph);
i_a << ConvolutionLayer(
- 1U, 1U, a_filt,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
+ 1U, 1U, a_filt,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
SubStream i_b(graph);
i_b << ConvolutionLayer(
- 1U, 1U, std::get<0>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 7U, 1U, std::get<1>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu")
+ 7U, 1U, std::get<1>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu")
<< ConvolutionLayer(
- 1U, 7U, std::get<2>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0c_7x1/Relu");
+ 1U, 7U, std::get<2>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+                  .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu");
SubStream i_c(graph);
i_c << ConvolutionLayer(
- 1U, 1U, std::get<0>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 1U, 7U, std::get<1>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu")
+ 1U, 7U, std::get<1>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu")
<< ConvolutionLayer(
- 7U, 1U, std::get<2>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu")
+ 7U, 1U, std::get<2>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu")
<< ConvolutionLayer(
- 1U, 7U, std::get<3>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu")
+ 1U, 7U, std::get<3>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu")
<< ConvolutionLayer(
- 7U, 1U, std::get<4>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu");
+ 7U, 1U, std::get<4>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL),
- true))
- .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
<< ConvolutionLayer(
- 1U, 1U, d_filt,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
+ 1U, 1U, d_filt,
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
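// Editor's note (illustration only, continuing the sketch above): with such a helper, Branch_0
// of this node (the 1x1 convolution built from a_filt and PadStrideInfo(1, 1, 0, 0) in the hunk
// above) would collapse to a single call. The helper remains a hypothetical refactoring and is
// not something introduced by this clang-format change.
SubStream i_a_sketch(graph);
append_conv_bn_relu_sketch(i_a_sketch, data_path, total_path, param_path, "Branch_0/Conv2d_0a_1x1",
                           "Branch_0_Conv2d_0a_1x1", 1U, 1U, a_filt, weights_layout, PadStrideInfo(1, 1, 0, 0));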
- ConcatLayer get_inception_node_D(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- std::tuple<unsigned int, unsigned int> a_filters,
+ ConcatLayer get_inception_node_D(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ std::tuple<unsigned int, unsigned int> a_filters,
std::tuple<unsigned int, unsigned int, unsigned int, unsigned int> b_filters)
{
std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_";
SubStream i_a(graph);
i_a << ConvolutionLayer(
- 1U, 1U, std::get<0>(a_filters),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(a_filters),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<1>(a_filters),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_1a_3x3/Relu");
+ 3U, 3U, std::get<1>(a_filters),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_1a_3x3/Relu");
SubStream i_b(graph);
i_b << ConvolutionLayer(
- 1U, 1U, std::get<0>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 7U, 1U, std::get<1>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu")
+ 7U, 1U, std::get<1>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu")
<< ConvolutionLayer(
- 1U, 7U, std::get<2>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu")
+ 1U, 7U, std::get<2>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<3>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_1a_3x3/Relu");
+ 3U, 3U, std::get<3>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_1a_3x3/Relu");
SubStream i_c(graph);
- i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool");
+ i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name(param_path + "/Branch_2/MaxPool_1a_3x3/MaxPool");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c));
}
- ConcatLayer get_inception_node_E(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int a_filt,
- std::tuple<unsigned int, unsigned int, unsigned int> b_filters,
+ ConcatLayer get_inception_node_E(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int a_filt,
+ std::tuple<unsigned int, unsigned int, unsigned int> b_filters,
std::tuple<unsigned int, unsigned int, unsigned int, unsigned int> c_filters,
- unsigned int d_filt,
- bool is_name_different = false)
+ unsigned int d_filt,
+ bool is_name_different = false)
{
// This is due to a naming issue in the tf model
std::string conv_id = "_0b_";
- if(is_name_different)
+ if (is_name_different)
{
conv_id = "_0c_";
}
@@ -701,154 +726,152 @@ private:
std::string total_path = "/cnn_data/inceptionv3_model/" + param_path + "_";
SubStream i_a(graph);
i_a << ConvolutionLayer(
- 1U, 1U, a_filt,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
+ 1U, 1U, a_filt,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
SubStream i_b(graph);
i_b << ConvolutionLayer(
- 1U, 1U, std::get<0>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu");
+ 1U, 1U, std::get<0>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu");
SubStream i_b1(i_b);
i_b1 << ConvolutionLayer(
- 3U, 1U, std::get<1>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/convolution")
+ 3U, 1U, std::get<1>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu");
SubStream i_b2(i_b);
i_b2 << ConvolutionLayer(
- 1U, 3U, std::get<2>(b_filters),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 1))
- .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/Relu");
+ 1U, 3U, std::get<2>(b_filters),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id + "3x1_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 1))
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/convolution")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id +
+ "3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id +
+ "3x1_BatchNorm_moving_variance.npy"),
+ nullptr,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d" + conv_id +
+ "3x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d" + conv_id + "3x1/Relu");
// Merge b1 and b2
i_b << ConcatLayer(std::move(i_b1), std::move(i_b2)).set_name(param_path + "/Branch_1/concat");
SubStream i_c(graph);
i_c << ConvolutionLayer(
- 1U, 1U, std::get<0>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
+ 1U, 1U, std::get<0>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 3U, 3U, std::get<1>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu");
+ 3U, 3U, std::get<1>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu");
SubStream i_c1(i_c);
i_c1 << ConvolutionLayer(
- 3U, 1U, std::get<2>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/convolution")
+ 3U, 1U, std::get<2>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu");
SubStream i_c2(i_c);
i_c2 << ConvolutionLayer(
- 1U, 3U, std::get<3>(c_filters),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/convolution")
+ 1U, 3U, std::get<3>(c_filters),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_3x1/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_3x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_3x1/Relu");
// Merge i_c1 and i_c2
i_c << ConcatLayer(std::move(i_c1), std::move(i_c2)).set_name(param_path + "/Branch_2/concat");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL),
- true))
- .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
<< ConvolutionLayer(
- 1U, 1U, d_filt,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- nullptr,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
+ 1U, 1U, d_filt,
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ nullptr, get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp
index 6d8fab4141..052498ad38 100644
--- a/examples/graph_inception_v4.cpp
+++ b/examples/graph_inception_v4.cpp
@@ -39,8 +39,7 @@ using namespace arm_compute::graph_utils;
class InceptionV4Example final : public Example
{
public:
- InceptionV4Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV4")
+ InceptionV4Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "InceptionV4")
{
}
bool do_setup(int argc, char **argv) override
@@ -53,7 +52,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -70,51 +69,70 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(299U, 299U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
// Conv2d_1a_3x3
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_1a_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_1a_3x3/Relu")
// Conv2d_2a_3x3
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv2d_2a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_2a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2a_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv2d_2a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2a_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_2a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2a_3x3/Relu")
// Conv2d_2b_3x3
- << ConvolutionLayer(3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
- .set_name("Conv2d_2b_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_2b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv2d_2b_3x3/Relu");
+ << ConvolutionLayer(
+ 3U, 3U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_weights.npy",
+ weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Conv2d_2b_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_2b_3x3_BatchNorm_beta.npy"),
+ 0.001f)
+ .set_name("Conv2d_2b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv2d_2b_3x3/Relu");
graph << get_mixed_3a(data_path, weights_layout).set_name("Mixed_3a/concat");
graph << get_mixed_4a(data_path, weights_layout).set_name("Mixed_4a/concat");
@@ -140,15 +158,16 @@ public:
graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7b").set_name("Mixed_7b/concat");
graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7c").set_name("Mixed_7c/concat");
graph << get_inceptionC_block(data_path, weights_layout, "Mixed_7d").set_name("Mixed_7d/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a/AvgPool")
- << FlattenLayer().set_name("Logits/Flatten")
- << FullyConnectedLayer(
- 1001U,
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_biases.npy"))
- .set_name("Logits/MatMul")
- << SoftmaxLayer().set_name("Logits/Predictions")
- << OutputLayer(get_output_accessor(common_params, 5));
+ graph
+ << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("Logits/AvgPool_1a/AvgPool")
+ << FlattenLayer().set_name("Logits/Flatten")
+ << FullyConnectedLayer(
+ 1001U,
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_weights.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Logits_Logits_biases.npy"))
+ .set_name("Logits/MatMul")
+ << SoftmaxLayer().set_name("Logits/Predictions") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -162,7 +181,7 @@ public:
// Load the precompiled kernels from a file into the kernel library, in this way the next time they are needed
// compilation won't be required.
- if(common_params.enable_cl_cache)
+ if (common_params.enable_cl_cache)
{
#ifdef ARM_COMPUTE_CL
restore_program_cache_from_file();
@@ -172,7 +191,7 @@ public:
graph.finalize(common_params.target, config);
// Save the opencl kernels to a file
- if(common_opts.enable_cl_cache)
+ if (common_opts.enable_cl_cache)
{
#ifdef ARM_COMPUTE_CL
save_program_cache_to_file();
@@ -199,22 +218,24 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/Mixed_3a_";
SubStream i_a(graph);
- i_a << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL),
- true))
- .set_name("Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool");
+ i_a << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool");
SubStream i_b(graph);
- i_b << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b));
}
@@ -224,74 +245,86 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/Mixed_4a_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 64U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu");
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 64U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(7U, 1U, 64U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
- .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu")
- << ConvolutionLayer(1U, 7U, 64U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
- .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 7U, 1U, 64U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu")
+ << ConvolutionLayer(
+ 1U, 7U, 64U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b));
}
@@ -301,22 +334,24 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/Mixed_5a_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(3U, 3U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 3U, 3U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu");
SubStream i_b(graph);
- i_b << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL),
- true))
- .set_name("Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool");
+ i_b << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool");
return ConcatLayer(std::move(i_a), std::move(i_b));
}
@@ -326,92 +361,106 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 64U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_3x3/Relu");
SubStream i_c(graph);
- i_c << ConvolutionLayer(1U, 1U, 64U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu");
+ i_c << ConvolutionLayer(
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_3x3/Relu");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL),
- true))
- .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
- << ConvolutionLayer(1U, 1U, 96U,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
+ << ConvolutionLayer(
+ 1U, 1U, 96U,
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
@@ -421,57 +470,65 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/Mixed_6a_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(3U, 3U, 384U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 3U, 3U, 384U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu");
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 224U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
- .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu")
- << ConvolutionLayer(3U, 3U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 224U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu");
SubStream i_c(graph);
- i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL),
- true))
- .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool");
+ i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c));
}
@@ -481,125 +538,145 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 384U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 384U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(7U, 1U, 224U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu")
- << ConvolutionLayer(1U, 7U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 7U, 1U, 224U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x7/Relu")
+ << ConvolutionLayer(
+ 1U, 7U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_7x1/Relu");
SubStream i_c(graph);
- i_c << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(1U, 7U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu")
- << ConvolutionLayer(7U, 1U, 224U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu")
- << ConvolutionLayer(1U, 7U, 224U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
- .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu")
- << ConvolutionLayer(7U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu");
+ i_c << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 1U, 7U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_7x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_7x1/Relu")
+ << ConvolutionLayer(
+ 7U, 1U, 224U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x7_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x7/Relu")
+ << ConvolutionLayer(
+ 1U, 7U, 224U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_7x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_7x1/Relu")
+ << ConvolutionLayer(
+ 7U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_1x7_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0e_1x7/Relu");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL),
- true))
- .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
- << ConvolutionLayer(1U, 1U, 128U,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
+ << ConvolutionLayer(
+ 1U, 1U, 128U,
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
@@ -609,79 +686,91 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/Mixed_7a_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(3U, 3U, 192U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 192U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu");
SubStream i_b(graph);
- i_b << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
- << ConvolutionLayer(7U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu")
- << ConvolutionLayer(1U, 7U, 320U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
- .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu")
- << ConvolutionLayer(3U, 3U, 320U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
- .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu");
+ i_b << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu")
+ << ConvolutionLayer(
+ 7U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 3, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x7_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu")
+ << ConvolutionLayer(
+ 1U, 7U, 320U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 3))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_7x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 320U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 0, 0))
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_1a_3x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu");
SubStream i_c(graph);
- i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL),
- true))
- .set_name("Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool");
+ i_c << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, common_params.data_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL), true))
+ .set_name("Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c));
}
@@ -691,163 +780,163 @@ private:
std::string total_path = "/cnn_data/inceptionv4_model/" + param_path + "_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
+ i_a << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_0_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_0/Conv2d_0a_1x1/Relu");
SubStream i_b(graph);
i_b << ConvolutionLayer(
- 1U, 1U, 384U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu");
+ 1U, 1U, 384U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0a_1x1/Relu");
SubStream i_b1(i_b);
i_b1 << ConvolutionLayer(
- 3U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 0))
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Conv2D")
+ 3U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 0))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0b_1x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0b_1x3/Relu");
SubStream i_b2(i_b);
i_b2 << ConvolutionLayer(
- 1U, 3U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 1))
- .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Conv2D")
+ 1U, 3U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 1))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_1_Conv2d_0c_3x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_1/Conv2d_0c_3x1/Relu");
// Merge b1 and b2
i_b << ConcatLayer(std::move(i_b1), std::move(i_b2)).set_name(param_path + "/Branch_1/concat");
SubStream i_c(graph);
i_c << ConvolutionLayer(
- 1U, 1U, 384U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
+ 1U, 1U, 384U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0a_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0a_1x1/Relu")
<< ConvolutionLayer(
- 1U, 3U, 448U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Conv2D")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Relu")
+ 1U, 3U, 448U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0b_3x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0b_3x1/Relu")
<< ConvolutionLayer(
- 3U, 1U, 512U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Conv2D")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu");
+ 3U, 1U, 512U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0c_1x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0c_1x3/Relu");
SubStream i_c1(i_c);
i_c1 << ConvolutionLayer(
- 3U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 0))
- .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Conv2D")
+ 3U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 0))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0d_1x3_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0d_1x3/Relu");
SubStream i_c2(i_c);
i_c2 << ConvolutionLayer(
- 1U, 3U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 1))
- .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Conv2D")
+ 1U, 3U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 1))
+ .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Relu");
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path,
+ total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_2_Conv2d_0e_3x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_2/Conv2d_0e_3x1/Relu");
// Merge i_c1 and i_c2
i_c << ConcatLayer(std::move(i_c1), std::move(i_c2)).set_name(param_path + "/Branch_2/concat");
SubStream i_d(graph);
- i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL),
- true))
- .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
- << ConvolutionLayer(1U, 1U, 256U,
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
- get_random_accessor(1.f, 1.f),
- get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
+ i_d << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout,
+ PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL), true))
+ .set_name(param_path + "/Branch_3/AvgPool_0a_3x3/AvgPool")
+ << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_moving_variance.npy"),
+ get_random_accessor(1.f, 1.f),
+ get_weights_accessor(data_path, total_path + "Branch_3_Conv2d_0b_1x1_BatchNorm_beta.npy"), 0.001f)
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/Branch_3/Conv2d_0b_1x1/Relu");
return ConcatLayer(std::move(i_a), std::move(i_b), std::move(i_c), std::move(i_d));
}
diff --git a/examples/graph_lenet.cpp b/examples/graph_lenet.cpp
index 1bcd95fb58..7d6dce7b17 100644
--- a/examples/graph_lenet.cpp
+++ b/examples/graph_lenet.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphLenetExample : public Example
{
public:
- GraphLenetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "LeNet")
+ GraphLenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "LeNet")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,14 +49,15 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -67,43 +68,39 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(28U, 28U, 1U, batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(28U, 28U, 1U, batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
//conv1 << pool1 << conv2 << pool2 << fc1 << act1 << fc2 << smx
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params))
<< ConvolutionLayer(
- 5U, 5U, 20U,
- get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv1")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1")
+ 5U, 5U, 20U, get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/lenet_model/conv1_b.npy"), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv1")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool1")
<< ConvolutionLayer(
- 5U, 5U, 50U,
- get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2")
- << FullyConnectedLayer(
- 500U,
- get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_b.npy"))
- .set_name("ip1")
+ 5U, 5U, 50U, get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/lenet_model/conv2_b.npy"), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool2")
+ << FullyConnectedLayer(500U,
+ get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/lenet_model/ip1_b.npy"))
+ .set_name("ip1")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu")
- << FullyConnectedLayer(
- 10U,
- get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_b.npy"))
- .set_name("ip2")
- << SoftmaxLayer().set_name("prob")
- << OutputLayer(get_output_accessor(common_params));
+ << FullyConnectedLayer(10U,
+ get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/lenet_model/ip2_b.npy"))
+ .set_name("ip2")
+ << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params));
// Finalize graph
GraphConfig config;
diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp
index 4630dc958a..e3a6ef116d 100644
--- a/examples/graph_mobilenet.cpp
+++ b/examples/graph_mobilenet.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -36,14 +37,13 @@ using namespace arm_compute::graph_utils;
class GraphMobilenetExample : public Example
{
public:
- GraphMobilenetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV1")
+ GraphMobilenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV1")
{
// Add model id option
model_id_opt = cmd_parser.add_option<SimpleOption<int>>("model-id", 0);
model_id_opt->set_help("Mobilenet model id (0: 1.0_224, else: 0.75_160");
}
- GraphMobilenetExample(const GraphMobilenetExample &) = delete;
+ GraphMobilenetExample(const GraphMobilenetExample &) = delete;
GraphMobilenetExample &operator=(const GraphMobilenetExample &) = delete;
~GraphMobilenetExample() override = default;
bool do_setup(int argc, char **argv) override
@@ -56,7 +56,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -72,15 +72,17 @@ public:
unsigned int spatial_size = (model_id == 0 || common_params.data_type == DataType::QASYMM8) ? 224 : 160;
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(spatial_size, spatial_size, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(spatial_size, spatial_size, 3U, common_params.batches), DataLayout::NCHW,
+ common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set graph hints
- graph << common_params.target
- << common_params.fast_math_hint;
+ graph << common_params.target << common_params.fast_math_hint;
// Create core graph
- if(arm_compute::is_data_type_float(common_params.data_type))
+ if (arm_compute::is_data_type_float(common_params.data_type))
{
create_graph_float(input_descriptor, model_id);
}
@@ -90,8 +92,7 @@ public:
}
// Create common tail
- graph << ReshapeLayer(TensorShape(1001U)).set_name("Reshape")
- << SoftmaxLayer().set_name("Softmax")
+ graph << ReshapeLayer(TensorShape(1001U)).set_name("Reshape") << SoftmaxLayer().set_name("Softmax")
<< OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
@@ -115,14 +116,15 @@ public:
private:
CommandLineParser cmd_parser;
CommonGraphOptions common_opts;
- SimpleOption<int> *model_id_opt{ nullptr };
+ SimpleOption<int> *model_id_opt{nullptr};
CommonGraphParams common_params;
Stream graph;
void create_graph_float(TensorDescriptor &input_descriptor, int model_id)
{
float depth_scale = (model_id == 0) ? 1.f : 0.75;
- std::string model_path = (model_id == 0) ? "/cnn_data/mobilenet_v1_1_224_model/" : "/cnn_data/mobilenet_v1_075_160_model/";
+ std::string model_path =
+ (model_id == 0) ? "/cnn_data/mobilenet_v1_1_224_model/" : "/cnn_data/mobilenet_v1_075_160_model/";
// Create a preprocessor object
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<TFPreproccessor>();
@@ -131,47 +133,68 @@ private:
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
- graph << InputLayer(input_descriptor,
- get_input_accessor(common_params, std::move(preprocessor), false))
- << ConvolutionLayer(
- 3U, 3U, 32U * depth_scale,
- get_weights_accessor(data_path, "Conv2d_0_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))
- .set_name("Conv2d_0")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, "Conv2d_0_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, "Conv2d_0_BatchNorm_beta.npy"),
- 0.001f)
- .set_name("Conv2d_0/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name("Conv2d_0/Relu6");
- graph << get_dwsc_node_float(data_path, "Conv2d_1", 64 * depth_scale, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_2", 128 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_3", 128 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_4", 256 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_5", 256 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_6", 512 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_7", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_8", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_9", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_10", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_11", 512 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_12", 1024 * depth_scale, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << get_dwsc_node_float(data_path, "Conv2d_13", 1024 * depth_scale, PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL), PadStrideInfo(1, 1, 0, 0));
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a")
- << ConvolutionLayer(
- 1U, 1U, 1001U,
- get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy", DataLayout::NCHW),
- get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Logits/Conv2d_1c_1x1");
+ graph << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
+ << ConvolutionLayer(3U, 3U, 32U * depth_scale,
+ get_weights_accessor(data_path, "Conv2d_0_weights.npy", DataLayout::NCHW),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))
+ .set_name("Conv2d_0")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "Conv2d_0_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, "Conv2d_0_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, "Conv2d_0_BatchNorm_beta.npy"), 0.001f)
+ .set_name("Conv2d_0/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name("Conv2d_0/Relu6");
+ graph << get_dwsc_node_float(data_path, "Conv2d_1", 64 * depth_scale, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_2", 128 * depth_scale,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_3", 128 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_4", 256 * depth_scale,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_5", 256 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_6", 512 * depth_scale,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_7", 512 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_8", 512 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_9", 512 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_10", 512 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_11", 512 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_12", 1024 * depth_scale,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph << get_dwsc_node_float(data_path, "Conv2d_13", 1024 * depth_scale,
+ PadStrideInfo(1, 1, 1, 1, 1, 1, DimensionRoundingType::CEIL),
+ PadStrideInfo(1, 1, 0, 0));
+ graph
+ << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a")
+ << ConvolutionLayer(
+ 1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy", DataLayout::NCHW),
+ get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Logits/Conv2d_1c_1x1");
}
void create_graph_qasymm(TensorDescriptor &input_descriptor)
@@ -180,7 +203,7 @@ private:
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += "/cnn_data/mobilenet_qasymm8_model/";
}
@@ -188,19 +211,16 @@ private:
// Quantization info taken from the AndroidNN QASYMM8 MobileNet example
const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128);
- const std::vector<QuantizationInfo> conv_weights_quant_info =
- {
+ const std::vector<QuantizationInfo> conv_weights_quant_info = {
QuantizationInfo(0.02182667888700962f, 151), // conv0
QuantizationInfo(0.004986600950360298f, 74) // conv14
};
- const std::vector<QuantizationInfo> conv_out_quant_info =
- {
+ const std::vector<QuantizationInfo> conv_out_quant_info = {
QuantizationInfo(0.023528477177023888f, 0), // conv0
QuantizationInfo(0.16609922051429749f, 66) // conv14
};
- const std::vector<QuantizationInfo> depth_weights_quant_info =
- {
+ const std::vector<QuantizationInfo> depth_weights_quant_info = {
QuantizationInfo(0.29219913482666016f, 110), // dwsc1
QuantizationInfo(0.40277284383773804f, 130), // dwsc2
QuantizationInfo(0.06053730100393295f, 160), // dwsc3
@@ -216,8 +236,7 @@ private:
QuantizationInfo(0.12616927921772003f, 211) // dwsc13
};
- const std::vector<QuantizationInfo> point_weights_quant_info =
- {
+ const std::vector<QuantizationInfo> point_weights_quant_info = {
QuantizationInfo(0.030420949682593346f, 121), // dwsc1
QuantizationInfo(0.015148180536925793f, 104), // dwsc2
QuantizationInfo(0.013755458407104015f, 94), // dwsc3
@@ -235,108 +254,121 @@ private:
graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info),
get_input_accessor(common_params, nullptr, false))
- << ConvolutionLayer(
- 3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv2d_0_weights.npy"),
- get_weights_accessor(data_path, "Conv2d_0_bias.npy"),
- PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
- 1, conv_weights_quant_info.at(0), conv_out_quant_info.at(0))
- .set_name("Conv2d_0")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv2d_0/Relu6");
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_1", 64U, PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(0), point_weights_quant_info.at(0));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(1),
- point_weights_quant_info.at(1));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_3", 128U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(2),
- point_weights_quant_info.at(2));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_4", 256U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(3),
- point_weights_quant_info.at(3));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_5", 256U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(4),
- point_weights_quant_info.at(4));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_6", 512U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(5),
- point_weights_quant_info.at(5));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_7", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(6),
- point_weights_quant_info.at(6));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_8", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(7),
- point_weights_quant_info.at(7));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_9", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(8),
- point_weights_quant_info.at(8));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_10", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(9),
- point_weights_quant_info.at(9));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_11", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(10),
- point_weights_quant_info.at(10));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_12", 1024U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(11),
- point_weights_quant_info.at(11));
- graph << get_dwsc_node_qasymm(data_path, "Conv2d_13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(12),
- point_weights_quant_info.at(12))
- << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a")
- << ConvolutionLayer(
- 1U, 1U, 1001U,
- get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"),
- get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_bias.npy"),
- PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1), conv_out_quant_info.at(1))
- .set_name("Logits/Conv2d_1c_1x1");
+ << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_0_weights.npy"),
+ get_weights_accessor(data_path, "Conv2d_0_bias.npy"),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), 1,
+ conv_weights_quant_info.at(0), conv_out_quant_info.at(0))
+ .set_name("Conv2d_0")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name("Conv2d_0/Relu6");
+ graph << get_dwsc_node_qasymm(data_path, "Conv2d_1", 64U, PadStrideInfo(1U, 1U, 1U, 1U),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(0),
+ point_weights_quant_info.at(0));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(1), point_weights_quant_info.at(1));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_3", 128U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(2), point_weights_quant_info.at(2));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_4", 256U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(3), point_weights_quant_info.at(3));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_5", 256U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(4), point_weights_quant_info.at(4));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_6", 512U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(5), point_weights_quant_info.at(5));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_7", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(6), point_weights_quant_info.at(6));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_8", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(7), point_weights_quant_info.at(7));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_9", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(8), point_weights_quant_info.at(8));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_10", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(9), point_weights_quant_info.at(9));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_11", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(10), point_weights_quant_info.at(10));
+ graph << get_dwsc_node_qasymm(
+ data_path, "Conv2d_12", 1024U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(11), point_weights_quant_info.at(11));
+ graph
+ << get_dwsc_node_qasymm(
+ data_path, "Conv2d_13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(12), point_weights_quant_info.at(12))
+ << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool_1a")
+ << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"),
+ get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_bias.npy"),
+ PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1),
+ conv_out_quant_info.at(1))
+ .set_name("Logits/Conv2d_1c_1x1");
}
- ConcatLayer get_dwsc_node_float(const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt,
- PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info)
+ ConcatLayer get_dwsc_node_float(const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo dwc_pad_stride_info,
+ PadStrideInfo conv_pad_stride_info)
{
std::string total_path = param_path + "_";
SubStream sg(graph);
sg << DepthwiseConvolutionLayer(
- 3U, 3U,
- get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- dwc_pad_stride_info)
- .set_name(total_path + "depthwise/depthwise")
+ 3U, 3U,
+ get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), dwc_pad_stride_info)
+ .set_name(total_path + "depthwise/depthwise")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(total_path + "depthwise/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "depthwise/Relu6")
- << ConvolutionLayer(
- 1U, 1U, conv_filt,
- get_weights_accessor(data_path, total_path + "pointwise_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- conv_pad_stride_info)
- .set_name(total_path + "pointwise/Conv2D")
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"), 0.001f)
+ .set_name(total_path + "depthwise/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name(total_path + "depthwise/Relu6")
+ << ConvolutionLayer(1U, 1U, conv_filt,
+ get_weights_accessor(data_path, total_path + "pointwise_weights.npy", DataLayout::NCHW),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), conv_pad_stride_info)
+ .set_name(total_path + "pointwise/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_beta.npy"),
- 0.001f)
- .set_name(total_path + "pointwise/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "pointwise/Relu6");
+ get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "pointwise_BatchNorm_beta.npy"), 0.001f)
+ .set_name(total_path + "pointwise/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name(total_path + "pointwise/Relu6");
return ConcatLayer(std::move(sg));
}
- ConcatLayer get_dwsc_node_qasymm(const std::string &data_path, std::string &&param_path,
+ ConcatLayer get_dwsc_node_qasymm(const std::string &data_path,
+ std::string &&param_path,
const unsigned int conv_filt,
- PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info,
- QuantizationInfo depth_weights_quant_info, QuantizationInfo point_weights_quant_info)
+ PadStrideInfo dwc_pad_stride_info,
+ PadStrideInfo conv_pad_stride_info,
+ QuantizationInfo depth_weights_quant_info,
+ QuantizationInfo point_weights_quant_info)
{
std::string total_path = param_path + "_";
SubStream sg(graph);
- sg << DepthwiseConvolutionLayer(
- 3U, 3U,
- get_weights_accessor(data_path, total_path + "depthwise_weights.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_bias.npy"),
- dwc_pad_stride_info, 1, std::move(depth_weights_quant_info))
- .set_name(total_path + "depthwise/depthwise")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "depthwise/Relu6")
- << ConvolutionLayer(
- 1U, 1U, conv_filt,
- get_weights_accessor(data_path, total_path + "pointwise_weights.npy"),
- get_weights_accessor(data_path, total_path + "pointwise_bias.npy"),
- conv_pad_stride_info, 1, std::move(point_weights_quant_info))
- .set_name(total_path + "pointwise/Conv2D")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "pointwise/Relu6");
+ sg << DepthwiseConvolutionLayer(3U, 3U, get_weights_accessor(data_path, total_path + "depthwise_weights.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_bias.npy"),
+ dwc_pad_stride_info, 1, std::move(depth_weights_quant_info))
+ .set_name(total_path + "depthwise/depthwise")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name(total_path + "depthwise/Relu6")
+ << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "pointwise_weights.npy"),
+ get_weights_accessor(data_path, total_path + "pointwise_bias.npy"), conv_pad_stride_info,
+ 1, std::move(point_weights_quant_info))
+ .set_name(total_path + "pointwise/Conv2D")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name(total_path + "pointwise/Relu6");
return ConcatLayer(std::move(sg));
}
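
For readers skimming the reformatted MobileNet example above: the get_dwsc_node_float()/get_dwsc_node_qasymm() helpers all follow the same graph-frontend pattern — branch a SubStream off the main Stream, chain layers with operator<<, and hand the branch back as a ConcatLayer so the caller can stream it into the graph. The sketch below condenses that pattern; it is illustrative only and not part of the patch. The dwsc_block name and the .npy file names are placeholders, and the BatchNorm stages of the original helper are omitted for brevity.

    #include "arm_compute/graph.h"
    #include "utils/GraphUtils.h"

    using namespace arm_compute;
    using namespace arm_compute::graph::frontend;
    using namespace arm_compute::graph_utils;

    // Sketch only: a 3x3 depthwise + 1x1 pointwise block mirroring get_dwsc_node_float() above.
    // File names and the caller-provided `graph` Stream are assumptions, not library fixtures.
    ConcatLayer dwsc_block(Stream &graph, const std::string &data_path, const std::string &prefix,
                           unsigned int conv_filt, PadStrideInfo dwc_info, PadStrideInfo pwc_info)
    {
        SubStream sg(graph); // branch off the caller's stream
        sg << DepthwiseConvolutionLayer(
                  3U, 3U, get_weights_accessor(data_path, prefix + "_depthwise_weights.npy", DataLayout::NCHW),
                  std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), dwc_info)
                  .set_name(prefix + "/depthwise")
           << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
                  .set_name(prefix + "/depthwise/Relu6")
           << ConvolutionLayer(1U, 1U, conv_filt,
                               get_weights_accessor(data_path, prefix + "_pointwise_weights.npy", DataLayout::NCHW),
                               std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), pwc_info)
                  .set_name(prefix + "/pointwise/Conv2D")
           << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
                  .set_name(prefix + "/pointwise/Relu6");
        return ConcatLayer(std::move(sg)); // hand the branch back so the caller can stream it into `graph`
    }
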
diff --git a/examples/graph_mobilenet_v2.cpp b/examples/graph_mobilenet_v2.cpp
index c027e6f13e..9bc21c42c5 100644
--- a/examples/graph_mobilenet_v2.cpp
+++ b/examples/graph_mobilenet_v2.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -36,11 +37,10 @@ using namespace arm_compute::graph_utils;
class GraphMobilenetV2Example : public Example
{
public:
- GraphMobilenetV2Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV2")
+ GraphMobilenetV2Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetV2")
{
}
- GraphMobilenetV2Example(const GraphMobilenetV2Example &) = delete;
+ GraphMobilenetV2Example(const GraphMobilenetV2Example &) = delete;
GraphMobilenetV2Example &operator=(const GraphMobilenetV2Example &) = delete;
~GraphMobilenetV2Example() override = default;
@@ -54,7 +54,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -64,15 +64,16 @@ public:
std::cout << common_params << std::endl;
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches),
+ DataLayout::NCHW, common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set graph hints
- graph << common_params.target
- << common_params.fast_math_hint;
+ graph << common_params.target << common_params.fast_math_hint;
// Create core graph
- if(arm_compute::is_data_type_float(common_params.data_type))
+ if (arm_compute::is_data_type_float(common_params.data_type))
{
create_graph_float(input_descriptor);
}
@@ -82,8 +83,7 @@ public:
}
// Create common tail
graph << ReshapeLayer(TensorShape(1001U)).set_name("Predictions/Reshape")
- << SoftmaxLayer().set_name("Predictions/Softmax")
- << OutputLayer(get_output_accessor(common_params, 5));
+ << SoftmaxLayer().set_name("Predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -136,123 +136,143 @@ private:
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
graph << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false))
- << ConvolutionLayer(3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv_weights.npy", DataLayout::NCHW),
+ << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv_weights.npy", DataLayout::NCHW),
std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL))
- .set_name("Conv")
+ .set_name("Conv")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv_BatchNorm_moving_variance.npy"),
get_weights_accessor(data_path, "Conv_BatchNorm_gamma.npy"),
get_weights_accessor(data_path, "Conv_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv/BatchNorm")
+ .set_name("Conv/BatchNorm")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
- .set_name("Conv/Relu6");
+ .set_name("Conv/Relu6");
get_expanded_conv_float(data_path, "expanded_conv", 32U, 16U, PadStrideInfo(1, 1, 1, 1));
- get_expanded_conv_float(data_path, "expanded_conv_1", 16U, 24U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_2", 24U, 24U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_3", 24U, 32U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_4", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_5", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_6", 32U, 64U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_7", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_8", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_9", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_1", 16U, 24U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_2", 24U, 24U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_3", 24U, 32U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_4", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_5", 32U, 32U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_6", 32U, 64U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_7", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_8", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_9", 64U, 64U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
get_expanded_conv_float(data_path, "expanded_conv_10", 64U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_11", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_12", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_13", 96U, 160U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_14", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
- get_expanded_conv_float(data_path, "expanded_conv_15", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes, IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_11", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_12", 96U, 96U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_13", 96U, 160U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), HasExpand::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_14", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
+ get_expanded_conv_float(data_path, "expanded_conv_15", 160U, 160U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes,
+ IsResidual::Yes);
get_expanded_conv_float(data_path, "expanded_conv_16", 160U, 320U, PadStrideInfo(1, 1, 1, 1), HasExpand::Yes);
- graph << ConvolutionLayer(1U, 1U, 1280U,
- get_weights_accessor(data_path, "Conv_1_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("Conv_1")
+ graph << ConvolutionLayer(
+ 1U, 1U, 1280U, get_weights_accessor(data_path, "Conv_1_weights.npy", DataLayout::NCHW),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("Conv_1")
<< BatchNormalizationLayer(get_weights_accessor(data_path, "Conv_1_BatchNorm_moving_mean.npy"),
get_weights_accessor(data_path, "Conv_1_BatchNorm_moving_variance.npy"),
get_weights_accessor(data_path, "Conv_1_BatchNorm_gamma.npy"),
get_weights_accessor(data_path, "Conv_1_BatchNorm_beta.npy"),
0.0010000000474974513f)
- .set_name("Conv_1/BatchNorm")
+ .set_name("Conv_1/BatchNorm")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
- .set_name("Conv_1/Relu6")
+ .set_name("Conv_1/Relu6")
<< PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool")
<< ConvolutionLayer(1U, 1U, 1001U,
get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy", DataLayout::NCHW),
get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"),
PadStrideInfo(1, 1, 0, 0))
- .set_name("Logits/Conv2d_1c_1x1");
+ .set_name("Logits/Conv2d_1c_1x1");
}
- void get_expanded_conv_float(const std::string &data_path, std::string &&param_path,
- unsigned int input_channels, unsigned int output_channels,
- PadStrideInfo dwc_pad_stride_info,
- HasExpand has_expand = HasExpand::No, IsResidual is_residual = IsResidual::No,
- unsigned int expansion_size = 6)
+ void get_expanded_conv_float(const std::string &data_path,
+ std::string &&param_path,
+ unsigned int input_channels,
+ unsigned int output_channels,
+ PadStrideInfo dwc_pad_stride_info,
+ HasExpand has_expand = HasExpand::No,
+ IsResidual is_residual = IsResidual::No,
+ unsigned int expansion_size = 6)
{
std::string total_path = param_path + "_";
SubStream left(graph);
// Add expand node
- if(has_expand == HasExpand::Yes)
+ if (has_expand == HasExpand::Yes)
{
- left << ConvolutionLayer(1U, 1U, input_channels * expansion_size,
- get_weights_accessor(data_path, total_path + "expand_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/expand/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, total_path + "expand_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, total_path + "expand_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(param_path + "/expand/BatchNorm")
+ left << ConvolutionLayer(
+ 1U, 1U, input_channels * expansion_size,
+ get_weights_accessor(data_path, total_path + "expand_weights.npy", DataLayout::NCHW),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/expand/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "expand_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, total_path + "expand_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "expand_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(param_path + "/expand/BatchNorm")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
- .set_name(param_path + "/expand/Relu6");
+ .set_name(param_path + "/expand/Relu6");
}
// Add depthwise node
- left << DepthwiseConvolutionLayer(3U, 3U,
- get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- dwc_pad_stride_info)
- .set_name(param_path + "/depthwise/depthwise")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"),
- 0.0010000000474974513f)
- .set_name(param_path + "/depthwise/BatchNorm")
+ left << DepthwiseConvolutionLayer(
+ 3U, 3U,
+ get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy", DataLayout::NCHW),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), dwc_pad_stride_info)
+ .set_name(param_path + "/depthwise/depthwise")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_BatchNorm_beta.npy"),
+ 0.0010000000474974513f)
+ .set_name(param_path + "/depthwise/BatchNorm")
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
- .set_name(param_path + "/depthwise/Relu6");
+ .set_name(param_path + "/depthwise/Relu6");
// Add project node
left << ConvolutionLayer(1U, 1U, output_channels,
get_weights_accessor(data_path, total_path + "project_weights.npy", DataLayout::NCHW),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/project/Conv2D")
- << BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, total_path + "project_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, total_path + "project_BatchNorm_beta.npy"),
- 0.0010000000474974513)
- .set_name(param_path + "/project/BatchNorm");
-
- if(is_residual == IsResidual::Yes)
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/project/Conv2D")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, total_path + "project_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, total_path + "project_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "project_BatchNorm_beta.npy"), 0.0010000000474974513)
+ .set_name(param_path + "/project/BatchNorm");
+
+ if (is_residual == IsResidual::Yes)
{
// Add residual node
SubStream right(graph);
- graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add");
+ graph
+ << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add");
}
else
{
@@ -269,7 +289,7 @@ private:
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
@@ -277,16 +297,14 @@ private:
const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128);
const QuantizationInfo mid_quant_info = QuantizationInfo(0.023528477177023888f, 128);
- const std::vector<QuantizationInfo> conv_weights_quant_info =
- {
+ const std::vector<QuantizationInfo> conv_weights_quant_info = {
QuantizationInfo(0.03396892547607422f, 122), // Conv
QuantizationInfo(0.005167067516595125f, 125), // Conv1
QuantizationInfo(0.0016910821432247758f, 113) // Conv2d_1c_1x1
};
// Pointwise expand convolution quantization info
- const std::vector<QuantizationInfo> pwc_q =
- {
+ const std::vector<QuantizationInfo> pwc_q = {
QuantizationInfo(0.254282623529f, 129), // expand_0 (Dummy)
QuantizationInfo(0.009758507832884789f, 127), // expand_1
QuantizationInfo(0.0036556976847350597f, 144), // expand_2
@@ -306,8 +324,7 @@ private:
QuantizationInfo(0.002046825597062707f, 135) // expand_16
};
// Depthwise expand convolution quantization info
- const std::vector<QuantizationInfo> dwc_q =
- {
+ const std::vector<QuantizationInfo> dwc_q = {
QuantizationInfo(0.3436955213546753f, 165), // expand_0
QuantizationInfo(0.020969120785593987f, 109), // expand_1
QuantizationInfo(0.16981913149356842f, 52), // expand_2
@@ -327,8 +344,7 @@ private:
QuantizationInfo(0.16456253826618195, 201) // expand_16
};
// Project convolution quantization info
- const std::vector<QuantizationInfo> prwc_q =
- {
+ const std::vector<QuantizationInfo> prwc_q = {
QuantizationInfo(0.03737175464630127f, 140), // expand_0
QuantizationInfo(0.0225360207259655f, 156), // expand_1
QuantizationInfo(0.02740888111293316f, 122), // expand_2
@@ -350,65 +366,84 @@ private:
graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info),
get_weights_accessor(data_path, common_params.image))
- << ConvolutionLayer(
- 3U, 3U, 32U,
- get_weights_accessor(data_path, "Conv_weights.npy"),
- get_weights_accessor(data_path, "Conv_bias.npy"),
- PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
- 1, conv_weights_quant_info.at(0), mid_quant_info)
- .set_name("Conv")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv/Relu6")
- << DepthwiseConvolutionLayer(3U, 3U,
- get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_weights.npy"),
- get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_biases.npy"),
- PadStrideInfo(1, 1, 1, 1), 1, dwc_q.at(0))
- .set_name("expanded_conv/depthwise/depthwise")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("expanded_conv/depthwise/Relu6")
- << ConvolutionLayer(1U, 1U, 16U,
- get_weights_accessor(data_path, "expanded_conv_project_weights.npy"),
+ << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv_weights.npy"),
+ get_weights_accessor(data_path, "Conv_bias.npy"),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), 1,
+ conv_weights_quant_info.at(0), mid_quant_info)
+ .set_name("Conv")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name("Conv/Relu6")
+ << DepthwiseConvolutionLayer(
+ 3U, 3U, get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_weights.npy"),
+ get_weights_accessor(data_path, "expanded_conv_depthwise_depthwise_biases.npy"),
+ PadStrideInfo(1, 1, 1, 1), 1, dwc_q.at(0))
+ .set_name("expanded_conv/depthwise/depthwise")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name("expanded_conv/depthwise/Relu6")
+ << ConvolutionLayer(1U, 1U, 16U, get_weights_accessor(data_path, "expanded_conv_project_weights.npy"),
get_weights_accessor(data_path, "expanded_conv_project_biases.npy"),
PadStrideInfo(1, 1, 0, 0), 1, prwc_q.at(0))
- .set_name("expanded_conv/project/Conv2D");
-
- get_expanded_conv_qasymm8(data_path, "expanded_conv_1", IsResidual::No, 96U, 24U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
- pwc_q.at(1), dwc_q.at(1), prwc_q.at(1));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_2", IsResidual::Yes, 144U, 24U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(2), dwc_q.at(2), prwc_q.at(2));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_3", IsResidual::No, 144U, 32U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
- pwc_q.at(3), dwc_q.at(3), prwc_q.at(3));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_4", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(4), dwc_q.at(4), prwc_q.at(4));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_5", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(5), dwc_q.at(5), prwc_q.at(5));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_6", IsResidual::No, 192U, 64U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
- pwc_q.at(6), dwc_q.at(6), prwc_q.at(6));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_7", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(7), dwc_q.at(7), prwc_q.at(7));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_8", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(8), dwc_q.at(8), prwc_q.at(8));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_9", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(9), dwc_q.at(9), prwc_q.at(9));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_10", IsResidual::No, 384U, 96U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(10), dwc_q.at(10), prwc_q.at(10));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_11", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(11), dwc_q.at(11), prwc_q.at(11));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_12", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(12), dwc_q.at(12), prwc_q.at(12));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_13", IsResidual::No, 576U, 160U, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL),
- pwc_q.at(13), dwc_q.at(13), prwc_q.at(13));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_14", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(14), dwc_q.at(14), prwc_q.at(14));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_15", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(15), dwc_q.at(15), prwc_q.at(15));
- get_expanded_conv_qasymm8(data_path, "expanded_conv_16", IsResidual::No, 960U, 320U, PadStrideInfo(1, 1, 1, 1), pwc_q.at(16), dwc_q.at(16), prwc_q.at(16));
-
- graph << ConvolutionLayer(1U, 1U, 1280U,
- get_weights_accessor(data_path, "Conv_1_weights.npy"),
- get_weights_accessor(data_path, "Conv_1_biases.npy"),
- PadStrideInfo(1, 1, 0, 0), 1, conv_weights_quant_info.at(1))
- .set_name("Conv_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv_1/Relu6")
+ .set_name("expanded_conv/project/Conv2D");
+
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_1", IsResidual::No, 96U, 24U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(1),
+ dwc_q.at(1), prwc_q.at(1));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_2", IsResidual::Yes, 144U, 24U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(2), dwc_q.at(2), prwc_q.at(2));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_3", IsResidual::No, 144U, 32U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(3),
+ dwc_q.at(3), prwc_q.at(3));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_4", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(4), dwc_q.at(4), prwc_q.at(4));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_5", IsResidual::Yes, 192U, 32U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(5), dwc_q.at(5), prwc_q.at(5));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_6", IsResidual::No, 192U, 64U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(6),
+ dwc_q.at(6), prwc_q.at(6));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_7", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(7), dwc_q.at(7), prwc_q.at(7));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_8", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(8), dwc_q.at(8), prwc_q.at(8));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_9", IsResidual::Yes, 384U, 64U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(9), dwc_q.at(9), prwc_q.at(9));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_10", IsResidual::No, 384U, 96U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(10), dwc_q.at(10), prwc_q.at(10));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_11", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(11), dwc_q.at(11), prwc_q.at(11));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_12", IsResidual::Yes, 576U, 96U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(12), dwc_q.at(12), prwc_q.at(12));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_13", IsResidual::No, 576U, 160U,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL), pwc_q.at(13),
+ dwc_q.at(13), prwc_q.at(13));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_14", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(14), dwc_q.at(14), prwc_q.at(14));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_15", IsResidual::Yes, 960U, 160U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(15), dwc_q.at(15), prwc_q.at(15));
+ get_expanded_conv_qasymm8(data_path, "expanded_conv_16", IsResidual::No, 960U, 320U, PadStrideInfo(1, 1, 1, 1),
+ pwc_q.at(16), dwc_q.at(16), prwc_q.at(16));
+
+ graph << ConvolutionLayer(1U, 1U, 1280U, get_weights_accessor(data_path, "Conv_1_weights.npy"),
+ get_weights_accessor(data_path, "Conv_1_biases.npy"), PadStrideInfo(1, 1, 0, 0), 1,
+ conv_weights_quant_info.at(1))
+ .set_name("Conv_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name("Conv_1/Relu6")
<< PoolingLayer(PoolingLayerInfo(PoolingType::AVG, common_params.data_layout)).set_name("Logits/AvgPool")
- << ConvolutionLayer(1U, 1U, 1001U,
- get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"),
+ << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"),
get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_biases.npy"),
PadStrideInfo(1, 1, 0, 0), 1, conv_weights_quant_info.at(2))
- .set_name("Logits/Conv2d_1c_1x1");
+ .set_name("Logits/Conv2d_1c_1x1");
}
- void get_expanded_conv_qasymm8(const std::string &data_path, std::string &&param_path, IsResidual is_residual,
- unsigned int input_channels, unsigned int output_channels,
+ void get_expanded_conv_qasymm8(const std::string &data_path,
+ std::string &&param_path,
+ IsResidual is_residual,
+ unsigned int input_channels,
+ unsigned int output_channels,
PadStrideInfo dwc_pad_stride_info,
- const QuantizationInfo &pwi, const QuantizationInfo &dwi, const QuantizationInfo &pji)
+ const QuantizationInfo &pwi,
+ const QuantizationInfo &dwi,
+ const QuantizationInfo &pji)
{
std::string total_path = param_path + "_";
@@ -417,25 +452,28 @@ private:
get_weights_accessor(data_path, total_path + "project_weights.npy"),
get_weights_accessor(data_path, total_path + "project_biases.npy"),
PadStrideInfo(1, 1, 0, 0), 1, pwi)
- .set_name(param_path + "/Conv2D")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(param_path + "/Conv2D/Relu6")
- << DepthwiseConvolutionLayer(3U, 3U,
- get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy"),
- get_weights_accessor(data_path, total_path + "depthwise_depthwise_biases.npy"),
- dwc_pad_stride_info, 1, dwi)
- .set_name(param_path + "/depthwise/depthwise")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(param_path + "/depthwise/Relu6")
+ .set_name(param_path + "/Conv2D")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name(param_path + "/Conv2D/Relu6")
+ << DepthwiseConvolutionLayer(
+ 3U, 3U, get_weights_accessor(data_path, total_path + "depthwise_depthwise_weights.npy"),
+ get_weights_accessor(data_path, total_path + "depthwise_depthwise_biases.npy"), dwc_pad_stride_info,
+ 1, dwi)
+ .set_name(param_path + "/depthwise/depthwise")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name(param_path + "/depthwise/Relu6")
<< ConvolutionLayer(1U, 1U, output_channels,
get_weights_accessor(data_path, total_path + "project_weights.npy"),
get_weights_accessor(data_path, total_path + "project_biases.npy"),
PadStrideInfo(1, 1, 0, 0), 1, pji)
- .set_name(param_path + "/project/Conv2D");
+ .set_name(param_path + "/project/Conv2D");
- if(is_residual == IsResidual::Yes)
+ if (is_residual == IsResidual::Yes)
{
// Add residual node
SubStream right(graph);
- graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add");
+ graph
+ << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(param_path + "/add");
}
else
{
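
A note on the QASYMM8 variants reformatted above: quantization parameters are not attached to tensors up front; each QuantizationInfo(scale, offset) pair is passed straight into the layer constructor alongside the group count. The condensed sketch below shows one such quantized convolution. It is not part of the patch; graph and data_path are assumed to be in scope as in the example, the scale/offset values are simply the Conv2d_0 numbers quoted in the hunks above, and reading the bare 1 as the number of groups is an interpretation of the example, not a documented claim.

    // Sketch only: one quantized convolution as built in create_graph_qasymm() above.
    // `graph` (a Stream) and `data_path` are assumed to exist in the caller's scope.
    const QuantizationInfo weights_qinfo(0.02182667888700962f, 151); // Conv2d_0 weights
    const QuantizationInfo output_qinfo(0.023528477177023888f, 0);   // Conv2d_0 output

    graph << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "Conv2d_0_weights.npy"),
                              get_weights_accessor(data_path, "Conv2d_0_bias.npy"),
                              PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
                              1 /* groups, per the example */, weights_qinfo, output_qinfo)
              .set_name("Conv2d_0")
          << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
              .set_name("Conv2d_0/Relu6");
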
diff --git a/examples/graph_resnet12.cpp b/examples/graph_resnet12.cpp
index 48708ce29a..80db826be5 100644
--- a/examples/graph_resnet12.cpp
+++ b/examples/graph_resnet12.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -36,7 +37,12 @@ class GraphResNet12Example : public Example
{
public:
GraphResNet12Example()
- : cmd_parser(), common_opts(cmd_parser), model_input_width(nullptr), model_input_height(nullptr), common_params(), graph(0, "ResNet12")
+ : cmd_parser(),
+ common_opts(cmd_parser),
+ model_input_width(nullptr),
+ model_input_height(nullptr),
+ common_params(),
+ graph(0, "ResNet12")
{
model_input_width = cmd_parser.add_option<SimpleOption<unsigned int>>("image-width", 192);
model_input_height = cmd_parser.add_option<SimpleOption<unsigned int>>("image-height", 128);
@@ -45,7 +51,7 @@ public:
model_input_width->set_help("Input image width.");
model_input_height->set_help("Input image height.");
}
- GraphResNet12Example(const GraphResNet12Example &) = delete;
+ GraphResNet12Example(const GraphResNet12Example &) = delete;
GraphResNet12Example &operator=(const GraphResNet12Example &) = delete;
~GraphResNet12Example() override = default;
bool do_setup(int argc, char **argv) override
@@ -58,7 +64,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -69,7 +75,8 @@ public:
const unsigned int image_height = model_input_height->value();
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -84,50 +91,47 @@ public:
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<TFPreproccessor>();
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW,
+ common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */))
- << ConvolutionLayer(
- 9U, 9U, 64U,
- get_weights_accessor(data_path, "conv1_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 4, 4))
- .set_name("conv1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu");
+ graph << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor),
+ false /* Do not convert to BGR */))
+ << ConvolutionLayer(9U, 9U, 64U, get_weights_accessor(data_path, "conv1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 4, 4))
+ .set_name("conv1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1/Relu");
add_residual_block(data_path, "block1", weights_layout);
add_residual_block(data_path, "block2", weights_layout);
add_residual_block(data_path, "block3", weights_layout);
add_residual_block(data_path, "block4", weights_layout);
- graph << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "conv10_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv10_biases.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv10/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv10/Relu")
- << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "conv11_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv11_biases.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv11/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv11/Relu")
- << ConvolutionLayer(
- 9U, 9U, 3U,
- get_weights_accessor(data_path, "conv12_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv12_biases.npy"),
- PadStrideInfo(1, 1, 4, 4))
- .set_name("conv12/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH)).set_name("conv12/Tanh")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.58f, 0.5f)).set_name("conv12/Linear")
+ graph << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, "conv10_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv10_biases.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv10/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv10/Relu")
+ << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, "conv11_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv11_biases.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv11/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv11/Relu")
+ << ConvolutionLayer(9U, 9U, 3U, get_weights_accessor(data_path, "conv12_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv12_biases.npy"), PadStrideInfo(1, 1, 4, 4))
+ .set_name("conv12/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH))
+ .set_name("conv12/Tanh")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 0.58f, 0.5f))
+ .set_name("conv12/Linear")
<< OutputLayer(std::make_unique<DummyAccessor>(0));
// Finalize graph
@@ -152,8 +156,8 @@ public:
private:
CommandLineParser cmd_parser;
CommonGraphOptions common_opts;
- SimpleOption<unsigned int> *model_input_width{ nullptr };
- SimpleOption<unsigned int> *model_input_height{ nullptr };
+ SimpleOption<unsigned int> *model_input_width{nullptr};
+ SimpleOption<unsigned int> *model_input_height{nullptr};
CommonGraphParams common_params;
Stream graph;
@@ -170,35 +174,33 @@ private:
SubStream left(graph);
SubStream right(graph);
- right << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "conv1_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "conv1/convolution")
+ right << ConvolutionLayer(3U, 3U, 64U,
+ get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "conv1_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 1, 1))
+ .set_name(unit_name + "conv1/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name(unit_name + "conv1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu")
-
- << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "conv2_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(unit_name + "conv2/convolution")
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), 0.0000100099996416f)
+ .set_name(unit_name + "conv1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv1/Relu")
+
+ << ConvolutionLayer(3U, 3U, 64U,
+ get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "conv2_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 1, 1))
+ .set_name(unit_name + "conv2/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name(unit_name + "conv2/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv2/Relu");
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), 0.0000100099996416f)
+ .set_name(unit_name + "conv2/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv2/Relu");
graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add");
}
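
The add_residual_block() helpers above, like the MobileNetV2 expanded-conv helpers, close their blocks the same way: two SubStream branches are taken off the main Stream, one carries the convolutions while the other is left untouched as the identity shortcut, and an EltwiseLayer with EltwiseOperation::Add merges them back into the graph. The minimal sketch below shows only that join; it is illustrative and not part of the patch, graph, data_path and weights_layout are assumed to be in scope as in the ResNet examples, the .npy names are placeholders, and the residual branch is shortened to a single convolution.

    // Sketch only: the residual join used by add_residual_block() above.
    // `left` stays empty and therefore acts as the identity shortcut;
    // `right` carries the (shortened) convolution branch.
    SubStream left(graph);
    SubStream right(graph);
    right << ConvolutionLayer(3U, 3U, 64U,
                              get_weights_accessor(data_path, "block1_unit_1_conv1_weights.npy", weights_layout),
                              get_weights_accessor(data_path, "block1_unit_1_conv1_biases.npy", weights_layout),
                              PadStrideInfo(1, 1, 1, 1))
              .set_name("block1/unit1/conv1/convolution")
          << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
              .set_name("block1/unit1/conv1/Relu");
    graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name("block1/unit1/add");
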
diff --git a/examples/graph_resnet50.cpp b/examples/graph_resnet50.cpp
index 0d3322c886..ba0f0d5fb6 100644
--- a/examples/graph_resnet50.cpp
+++ b/examples/graph_resnet50.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphResNetV1_50Example : public Example
{
public:
- GraphResNetV1_50Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV1_50")
+ GraphResNetV1_50Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV1_50")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,7 +49,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -62,36 +62,40 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 122.68f, 116.67f, 104.01f } };
- std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb,
- false /* Do not convert to BGR */);
+ const std::array<float, 3> mean_rgb{{122.68f, 116.67f, 104.01f}};
+ std::unique_ptr<IPreprocessor> preprocessor =
+ std::make_unique<CaffePreproccessor>(mean_rgb, false /* Do not convert to BGR */);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */))
+ graph << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor),
+ false /* Do not convert to BGR */))
<< ConvolutionLayer(
- 7U, 7U, 64U,
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 3, 3))
- .set_name("conv1/convolution")
+ 7U, 7U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 3, 3))
+ .set_name("conv1/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name("conv1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool1/MaxPool");
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_beta.npy"),
+ 0.0000100099996416f)
+ .set_name("conv1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)))
+ .set_name("pool1/MaxPool");
add_residual_block(data_path, "block1", weights_layout, 64, 3, 2);
add_residual_block(data_path, "block2", weights_layout, 128, 4, 2);
@@ -100,13 +104,12 @@ public:
graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool5")
<< ConvolutionLayer(
- 1U, 1U, 1000U,
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_biases.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("logits/convolution")
- << FlattenLayer().set_name("predictions/Reshape")
- << SoftmaxLayer().set_name("predictions/Softmax")
+ 1U, 1U, 1000U,
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_biases.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("logits/convolution")
+ << FlattenLayer().set_name("predictions/Reshape") << SoftmaxLayer().set_name("predictions/Softmax")
<< OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
@@ -136,10 +139,14 @@ private:
CommonGraphParams common_params;
Stream graph;
- void add_residual_block(const std::string &data_path, const std::string &name, DataLayout weights_layout,
- unsigned int base_depth, unsigned int num_units, unsigned int stride)
+ void add_residual_block(const std::string &data_path,
+ const std::string &name,
+ DataLayout weights_layout,
+ unsigned int base_depth,
+ unsigned int num_units,
+ unsigned int stride)
{
- for(unsigned int i = 0; i < num_units; ++i)
+ for (unsigned int i = 0; i < num_units; ++i)
{
std::stringstream unit_path_ss;
unit_path_ss << "/cnn_data/resnet50_model/" << name << "_unit_" << (i + 1) << "_bottleneck_v1_";
@@ -151,89 +158,90 @@ private:
unsigned int middle_stride = 1;
- if(i == (num_units - 1))
+ if (i == (num_units - 1))
{
middle_stride = stride;
}
SubStream right(graph);
- right << ConvolutionLayer(
- 1U, 1U, base_depth,
- get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "conv1/convolution")
+ right << ConvolutionLayer(1U, 1U, base_depth,
+ get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "conv1/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name(unit_name + "conv1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu")
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"), 0.0000100099996416f)
+ .set_name(unit_name + "conv1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv1/Relu")
- << ConvolutionLayer(
- 3U, 3U, base_depth,
- get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(middle_stride, middle_stride, 1, 1))
- .set_name(unit_name + "conv2/convolution")
+ << ConvolutionLayer(3U, 3U, base_depth,
+ get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(middle_stride, middle_stride, 1, 1))
+ .set_name(unit_name + "conv2/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name(unit_name + "conv2/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu")
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"), 0.0000100099996416f)
+ .set_name(unit_name + "conv2/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv1/Relu")
- << ConvolutionLayer(
- 1U, 1U, base_depth * 4,
- get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "conv3/convolution")
+ << ConvolutionLayer(1U, 1U, base_depth * 4,
+ get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "conv3/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name(unit_name + "conv2/BatchNorm");
+ get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv3_BatchNorm_beta.npy"), 0.0000100099996416f)
+ .set_name(unit_name + "conv2/BatchNorm");
- if(i == 0)
+ if (i == 0)
{
SubStream left(graph);
left << ConvolutionLayer(
- 1U, 1U, base_depth * 4,
- get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "shortcut/convolution")
+ 1U, 1U, base_depth * 4,
+ get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "shortcut/convolution")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_beta.npy"),
- 0.0000100099996416f)
- .set_name(unit_name + "shortcut/BatchNorm");
+ get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "shortcut_BatchNorm_beta.npy"),
+ 0.0000100099996416f)
+ .set_name(unit_name + "shortcut/BatchNorm");
- graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add");
+ graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add)
+ .set_name(unit_name + "add");
}
- else if(middle_stride > 1)
+ else if (middle_stride > 1)
{
SubStream left(graph);
- left << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout, PadStrideInfo(middle_stride, middle_stride, 0, 0), true)).set_name(unit_name + "shortcut/MaxPool");
+ left << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout,
+ PadStrideInfo(middle_stride, middle_stride, 0, 0), true))
+ .set_name(unit_name + "shortcut/MaxPool");
- graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add");
+ graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add)
+ .set_name(unit_name + "add");
}
else
{
SubStream left(graph);
- graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add");
+ graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add)
+ .set_name(unit_name + "add");
}
- graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
};
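
For readers skimming the reformatted graph_resnet50.cpp hunks above, the add_residual_block() helper boils down to a branch-and-merge idiom: each branch is built on a SubStream forked from the main Stream, then the branches are joined with an EltwiseLayer. The condensed sketch below is illustrative only and not part of this patch: the helper name add_identity_unit is made up, the null ITensorAccessor pointers stand in for the real get_weights_accessor() calls, and it assumes the same "arm_compute/graph.h" header and using-directives (arm_compute, arm_compute::graph::frontend) that the example files use.

    #include "arm_compute/graph.h"

    #include <memory>
    #include <string>

    using namespace arm_compute;
    using namespace arm_compute::graph::frontend;

    // Hypothetical helper: one residual unit with an identity shortcut.
    void add_identity_unit(Stream &graph, const std::string &unit_name, unsigned int depth)
    {
        // Residual branch: fork the main stream, append a 1x1 convolution
        // (weights omitted in this sketch) and a ReLU.
        SubStream right(graph);
        right << ConvolutionLayer(1U, 1U, depth,
                                  std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
                                  std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
                                  PadStrideInfo(1, 1, 0, 0))
                     .set_name(unit_name + "conv/convolution")
              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
                     .set_name(unit_name + "conv/Relu");

        // Shortcut branch: another fork with no layers appended, i.e. identity.
        SubStream left(graph);

        // Merge both branches back into the main stream with an element-wise add.
        graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add)
                     .set_name(unit_name + "add");
    }

The continuation indentation of the chained .set_name() calls in this sketch follows the same convention the clang-format configuration applies throughout the hunks above.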
diff --git a/examples/graph_resnet_v2_50.cpp b/examples/graph_resnet_v2_50.cpp
index 6d5abb4f4b..48cf9b0b3c 100644
--- a/examples/graph_resnet_v2_50.cpp
+++ b/examples/graph_resnet_v2_50.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphResNetV2_50Example : public Example
{
public:
- GraphResNetV2_50Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV2_50")
+ GraphResNetV2_50Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNetV2_50")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,7 +49,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -61,7 +61,7 @@ public:
// Get trainable parameters data path
std::string data_path = common_params.data_path;
std::string model_path = "/cnn_data/resnet_v2_50_model/";
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
@@ -71,45 +71,42 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */))
- << ConvolutionLayer(
- 7U, 7U, 64U,
- get_weights_accessor(data_path, "conv1_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv1_biases.npy", weights_layout),
- PadStrideInfo(2, 2, 3, 3))
- .set_name("conv1/convolution")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool1/MaxPool");
+ graph << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor),
+ false /* Do not convert to BGR */))
+ << ConvolutionLayer(7U, 7U, 64U, get_weights_accessor(data_path, "conv1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv1_biases.npy", weights_layout),
+ PadStrideInfo(2, 2, 3, 3))
+ .set_name("conv1/convolution")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)))
+ .set_name("pool1/MaxPool");
add_residual_block(data_path, "block1", weights_layout, 64, 3, 2);
add_residual_block(data_path, "block2", weights_layout, 128, 4, 2);
add_residual_block(data_path, "block3", weights_layout, 256, 6, 2);
add_residual_block(data_path, "block4", weights_layout, 512, 3, 1);
- graph << BatchNormalizationLayer(
- get_weights_accessor(data_path, "postnorm_moving_mean.npy"),
- get_weights_accessor(data_path, "postnorm_moving_variance.npy"),
- get_weights_accessor(data_path, "postnorm_gamma.npy"),
- get_weights_accessor(data_path, "postnorm_beta.npy"),
- 0.000009999999747378752f)
- .set_name("postnorm/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("postnorm/Relu")
+ graph << BatchNormalizationLayer(get_weights_accessor(data_path, "postnorm_moving_mean.npy"),
+ get_weights_accessor(data_path, "postnorm_moving_variance.npy"),
+ get_weights_accessor(data_path, "postnorm_gamma.npy"),
+ get_weights_accessor(data_path, "postnorm_beta.npy"), 0.000009999999747378752f)
+ .set_name("postnorm/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("postnorm/Relu")
<< PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool5")
- << ConvolutionLayer(
- 1U, 1U, 1001U,
- get_weights_accessor(data_path, "logits_weights.npy", weights_layout),
- get_weights_accessor(data_path, "logits_biases.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("logits/convolution")
- << FlattenLayer().set_name("predictions/Reshape")
- << SoftmaxLayer().set_name("predictions/Softmax")
+ << ConvolutionLayer(1U, 1U, 1001U, get_weights_accessor(data_path, "logits_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "logits_biases.npy"), PadStrideInfo(1, 1, 0, 0))
+ .set_name("logits/convolution")
+ << FlattenLayer().set_name("predictions/Reshape") << SoftmaxLayer().set_name("predictions/Softmax")
<< OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
@@ -139,10 +136,14 @@ private:
CommonGraphParams common_params;
Stream graph;
- void add_residual_block(const std::string &data_path, const std::string &name, DataLayout weights_layout,
- unsigned int base_depth, unsigned int num_units, unsigned int stride)
+ void add_residual_block(const std::string &data_path,
+ const std::string &name,
+ DataLayout weights_layout,
+ unsigned int base_depth,
+ unsigned int num_units,
+ unsigned int stride)
{
- for(unsigned int i = 0; i < num_units; ++i)
+ for (unsigned int i = 0; i < num_units; ++i)
{
// Generate unit names
std::stringstream unit_path_ss;
@@ -154,7 +155,8 @@ private:
std::string unit_name = unit_name_ss.str();
const TensorShape last_shape = graph.graph().node(graph.tail_node())->output(0)->desc().shape;
- unsigned int depth_in = last_shape[arm_compute::get_data_layout_dimension_index(common_params.data_layout, DataLayoutDimension::CHANNEL)];
+ unsigned int depth_in = last_shape[arm_compute::get_data_layout_dimension_index(
+ common_params.data_layout, DataLayoutDimension::CHANNEL)];
unsigned int depth_out = base_depth * 4;
// All units have stride 1 apart from last one
@@ -162,73 +164,76 @@ private:
// Preact
SubStream preact(graph);
- preact << BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "preact_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "preact_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "preact_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "preact_beta.npy"),
- 0.000009999999747378752f)
- .set_name(unit_name + "preact/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "preact/Relu");
+ preact << BatchNormalizationLayer(get_weights_accessor(data_path, unit_path + "preact_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "preact_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "preact_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "preact_beta.npy"),
+ 0.000009999999747378752f)
+ .set_name(unit_name + "preact/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "preact/Relu");
// Create bottleneck path
SubStream shortcut(graph);
- if(depth_in == depth_out)
+ if (depth_in == depth_out)
{
- if(middle_stride != 1)
+ if (middle_stride != 1)
{
- shortcut << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout, PadStrideInfo(middle_stride, middle_stride, 0, 0), true)).set_name(unit_name + "shortcut/MaxPool");
+ shortcut << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, common_params.data_layout,
+ PadStrideInfo(middle_stride, middle_stride, 0, 0), true))
+ .set_name(unit_name + "shortcut/MaxPool");
}
}
else
{
shortcut.forward_tail(preact.tail_node());
shortcut << ConvolutionLayer(
- 1U, 1U, depth_out,
- get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "shortcut_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "shortcut/convolution");
+ 1U, 1U, depth_out,
+ get_weights_accessor(data_path, unit_path + "shortcut_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "shortcut_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "shortcut/convolution");
}
// Create residual path
SubStream residual(preact);
- residual << ConvolutionLayer(
- 1U, 1U, base_depth,
- get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "conv1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"),
- 0.000009999999747378752f)
- .set_name(unit_name + "conv1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu")
- << ConvolutionLayer(
- 3U, 3U, base_depth,
- get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(middle_stride, middle_stride, 1, 1))
- .set_name(unit_name + "conv2/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"),
- get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"),
- 0.000009999999747378752f)
- .set_name(unit_name + "conv2/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu")
- << ConvolutionLayer(
- 1U, 1U, depth_out,
- get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "conv3_biases.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "conv3/convolution");
-
- graph << EltwiseLayer(std::move(shortcut), std::move(residual), EltwiseOperation::Add).set_name(unit_name + "add");
+ residual
+ << ConvolutionLayer(1U, 1U, base_depth,
+ get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "conv1/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv1_BatchNorm_beta.npy"),
+ 0.000009999999747378752f)
+ .set_name(unit_name + "conv1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv1/Relu")
+ << ConvolutionLayer(3U, 3U, base_depth,
+ get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(middle_stride, middle_stride, 1, 1))
+ .set_name(unit_name + "conv2/convolution")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_mean.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_moving_variance.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_gamma.npy"),
+ get_weights_accessor(data_path, unit_path + "conv2_BatchNorm_beta.npy"),
+ 0.000009999999747378752f)
+ .set_name(unit_name + "conv2/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv1/Relu")
+ << ConvolutionLayer(1U, 1U, depth_out,
+ get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "conv3_biases.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "conv3/convolution");
+
+ graph << EltwiseLayer(std::move(shortcut), std::move(residual), EltwiseOperation::Add)
+ .set_name(unit_name + "add");
}
}
};
diff --git a/examples/graph_resnext50.cpp b/examples/graph_resnext50.cpp
index 6378f6c741..12a1507c4c 100644
--- a/examples/graph_resnext50.cpp
+++ b/examples/graph_resnext50.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphResNeXt50Example : public Example
{
public:
- GraphResNeXt50Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNeXt50")
+ GraphResNeXt50Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ResNeXt50")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,14 +49,15 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -66,28 +67,33 @@ public:
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params))
<< ScaleLayer(get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_mul.npy"),
get_weights_accessor(data_path, "/cnn_data/resnext50_model/bn_data_add.npy"))
- .set_name("bn_data/Scale")
+ .set_name("bn_data/Scale")
<< ConvolutionLayer(
- 7U, 7U, 64U,
- get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"),
- PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR))
- .set_name("conv0/Convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR))).set_name("pool0");
-
- add_residual_block(data_path, weights_layout, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3, /*stride_conv_unit1*/ 1);
+ 7U, 7U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/resnext50_model/conv0_biases.npy"),
+ PadStrideInfo(2, 2, 2, 3, 2, 3, DimensionRoundingType::FLOOR))
+ .set_name("conv0/Convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv0/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)))
+ .set_name("pool0");
+
+ add_residual_block(data_path, weights_layout, /*ofm*/ 256, /*stage*/ 1, /*num_unit*/ 3,
+ /*stride_conv_unit1*/ 1);
add_residual_block(data_path, weights_layout, 512, 2, 4, 2);
add_residual_block(data_path, weights_layout, 1024, 3, 6, 2);
add_residual_block(data_path, weights_layout, 2048, 4, 3, 2);
@@ -121,10 +127,14 @@ private:
CommonGraphParams common_params;
Stream graph;
- void add_residual_block(const std::string &data_path, DataLayout weights_layout,
- unsigned int base_depth, unsigned int stage, unsigned int num_units, unsigned int stride_conv_unit1)
+ void add_residual_block(const std::string &data_path,
+ DataLayout weights_layout,
+ unsigned int base_depth,
+ unsigned int stage,
+ unsigned int num_units,
+ unsigned int stride_conv_unit1)
{
- for(unsigned int i = 0; i < num_units; ++i)
+ for (unsigned int i = 0; i < num_units; ++i)
{
std::stringstream unit_path_ss;
unit_path_ss << "/cnn_data/resnext50_model/stage" << stage << "_unit" << (i + 1) << "_";
@@ -135,54 +145,55 @@ private:
std::string unit_name = unit_name_ss.str();
PadStrideInfo pad_grouped_conv(1, 1, 1, 1);
- if(i == 0)
+ if (i == 0)
{
- pad_grouped_conv = (stage == 1) ? PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 1, 1) : PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 1, 0, 1, DimensionRoundingType::FLOOR);
+ pad_grouped_conv = (stage == 1) ? PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 1, 1)
+ : PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 1, 0, 1,
+ DimensionRoundingType::FLOOR);
}
SubStream right(graph);
- right << ConvolutionLayer(
- 1U, 1U, base_depth / 2,
- get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "conv1_biases.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "conv1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv1/Relu")
-
- << ConvolutionLayer(
- 3U, 3U, base_depth / 2,
- get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- pad_grouped_conv, 32)
- .set_name(unit_name + "conv2/convolution")
+ right << ConvolutionLayer(1U, 1U, base_depth / 2,
+ get_weights_accessor(data_path, unit_path + "conv1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "conv1_biases.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "conv1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv1/Relu")
+
+ << ConvolutionLayer(3U, 3U, base_depth / 2,
+ get_weights_accessor(data_path, unit_path + "conv2_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), pad_grouped_conv,
+ 32)
+ .set_name(unit_name + "conv2/convolution")
<< ScaleLayer(get_weights_accessor(data_path, unit_path + "bn2_mul.npy"),
get_weights_accessor(data_path, unit_path + "bn2_add.npy"))
- .set_name(unit_name + "conv1/Scale")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "conv2/Relu")
+ .set_name(unit_name + "conv1/Scale")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "conv2/Relu")
- << ConvolutionLayer(
- 1U, 1U, base_depth,
- get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout),
- get_weights_accessor(data_path, unit_path + "conv3_biases.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(unit_name + "conv3/convolution");
+ << ConvolutionLayer(1U, 1U, base_depth,
+ get_weights_accessor(data_path, unit_path + "conv3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, unit_path + "conv3_biases.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(unit_name + "conv3/convolution");
SubStream left(graph);
- if(i == 0)
+ if (i == 0)
{
- left << ConvolutionLayer(
- 1U, 1U, base_depth,
- get_weights_accessor(data_path, unit_path + "sc_weights.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 0))
- .set_name(unit_name + "sc/convolution")
+ left << ConvolutionLayer(1U, 1U, base_depth,
+ get_weights_accessor(data_path, unit_path + "sc_weights.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(stride_conv_unit1, stride_conv_unit1, 0, 0))
+ .set_name(unit_name + "sc/convolution")
<< ScaleLayer(get_weights_accessor(data_path, unit_path + "sc_bn_mul.npy"),
get_weights_accessor(data_path, unit_path + "sc_bn_add.npy"))
- .set_name(unit_name + "sc/scale");
+ .set_name(unit_name + "sc/scale");
}
graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name(unit_name + "add");
- graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "Relu");
+ graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "Relu");
}
}
};
diff --git a/examples/graph_shufflenet.cpp b/examples/graph_shufflenet.cpp
index 6e13c5eeb4..513d95884e 100644
--- a/examples/graph_shufflenet.cpp
+++ b/examples/graph_shufflenet.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class ShuffleNetExample : public Example
{
public:
- ShuffleNetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ShuffleNet")
+ ShuffleNetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "ShuffleNet")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,20 +49,21 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Set default layout if needed (Single kernel grouped convolution not yet supported int NHWC)
- if(!common_opts.data_layout->is_set())
+ if (!common_opts.data_layout->is_set())
{
common_params.data_layout = DataLayout::NHWC;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -75,15 +76,17 @@ public:
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += model_path;
}
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
@@ -91,24 +94,22 @@ public:
// Create preprocessor
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<TFPreproccessor>(0);
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */))
- << ConvolutionLayer(
- 3U, 3U, 24U,
- get_weights_accessor(data_path, "conv3_0_w_0.npy", weights_layout),
- get_weights_accessor(data_path, "conv3_0_b_0.npy", weights_layout),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("Conv1/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "conv3_0_bn_rm_0.npy"),
- get_weights_accessor(data_path, "conv3_0_bn_riv_0.npy"),
- get_weights_accessor(data_path, "conv3_0_bn_s_0.npy"),
- get_weights_accessor(data_path, "conv3_0_bn_b_0.npy"),
- 1e-5f)
- .set_name("Conv1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Conv1/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 1, 1))).set_name("pool1/MaxPool");
+ graph << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor),
+ false /* Do not convert to BGR */))
+ << ConvolutionLayer(3U, 3U, 24U, get_weights_accessor(data_path, "conv3_0_w_0.npy", weights_layout),
+ get_weights_accessor(data_path, "conv3_0_b_0.npy", weights_layout),
+ PadStrideInfo(2, 2, 1, 1))
+ .set_name("Conv1/convolution")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, "conv3_0_bn_rm_0.npy"),
+ get_weights_accessor(data_path, "conv3_0_bn_riv_0.npy"),
+ get_weights_accessor(data_path, "conv3_0_bn_s_0.npy"),
+ get_weights_accessor(data_path, "conv3_0_bn_b_0.npy"), 1e-5f)
+ .set_name("Conv1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("Conv1/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 1, 1)))
+ .set_name("pool1/MaxPool");
// Stage 2
add_residual_block(data_path, DataLayout::NCHW, 0U /* unit */, 112U /* depth */, 2U /* stride */);
@@ -134,13 +135,10 @@ public:
graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("predictions/AvgPool")
<< FlattenLayer().set_name("predictions/Reshape")
- << FullyConnectedLayer(
- 1000U,
- get_weights_accessor(data_path, "pred_w_0.npy", weights_layout),
- get_weights_accessor(data_path, "pred_b_0.npy"))
- .set_name("predictions/FC")
- << SoftmaxLayer().set_name("predictions/Softmax")
- << OutputLayer(get_output_accessor(common_params, 5));
+ << FullyConnectedLayer(1000U, get_weights_accessor(data_path, "pred_w_0.npy", weights_layout),
+ get_weights_accessor(data_path, "pred_b_0.npy"))
+ .set_name("predictions/FC")
+ << SoftmaxLayer().set_name("predictions/Softmax") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -167,8 +165,11 @@ private:
CommonGraphParams common_params;
Stream graph;
- void add_residual_block(const std::string &data_path, DataLayout weights_layout,
- unsigned int unit, unsigned int depth, unsigned int stride)
+ void add_residual_block(const std::string &data_path,
+ DataLayout weights_layout,
+ unsigned int unit,
+ unsigned int depth,
+ unsigned int stride)
{
PadStrideInfo dwc_info = PadStrideInfo(1, 1, 1, 1);
const unsigned int gconv_id = unit * 2;
@@ -181,63 +182,61 @@ private:
SubStream left_ss(graph);
SubStream right_ss(graph);
- if(stride == 2)
+ if (stride == 2)
{
- right_ss << PoolingLayer(PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(2, 2, 1, 1))).set_name(unit_name + "/pool_1/AveragePool");
+ right_ss << PoolingLayer(
+ PoolingLayerInfo(PoolingType::AVG, 3, common_params.data_layout, PadStrideInfo(2, 2, 1, 1)))
+ .set_name(unit_name + "/pool_1/AveragePool");
dwc_info = PadStrideInfo(2, 2, 1, 1);
}
- left_ss << ConvolutionLayer(
- 1U, 1U, depth,
- get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_w_0.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0), num_groups)
- .set_name(unit_name + "/gconv1_" + gconv_id_name + "/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_rm_0.npy"),
- get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_riv_0.npy"),
- get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_s_0.npy"),
- get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_b_0.npy"),
- 1e-5f)
- .set_name(unit_name + "/gconv1_" + gconv_id_name + "/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "/gconv1_" + gconv_id_name + "/Relu")
- << ChannelShuffleLayer(num_groups).set_name(unit_name + "/shuffle_0/ChannelShufle")
- << DepthwiseConvolutionLayer(
- 3U, 3U,
- get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_w_0.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- dwc_info)
- .set_name(unit_name + "/gconv3_" + unit_id_name + "/depthwise")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_rm_0.npy"),
- get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_riv_0.npy"),
- get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_s_0.npy"),
- get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_b_0.npy"),
- 1e-5f)
- .set_name(unit_name + "/gconv3_" + unit_id_name + "/BatchNorm")
- << ConvolutionLayer(
- 1U, 1U, depth,
- get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_w_0.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0), num_groups)
- .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/convolution")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_rm_0.npy"),
- get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_riv_0.npy"),
- get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_s_0.npy"),
- get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_b_0.npy"),
- 1e-5f)
- .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/BatchNorm");
+ left_ss
+ << ConvolutionLayer(1U, 1U, depth,
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_w_0.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(1, 1, 0, 0), num_groups)
+ .set_name(unit_name + "/gconv1_" + gconv_id_name + "/convolution")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_rm_0.npy"),
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_riv_0.npy"),
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_s_0.npy"),
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_name + "_bn_b_0.npy"),
+ 1e-5f)
+ .set_name(unit_name + "/gconv1_" + gconv_id_name + "/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "/gconv1_" + gconv_id_name + "/Relu")
+ << ChannelShuffleLayer(num_groups).set_name(unit_name + "/shuffle_0/ChannelShufle")
+ << DepthwiseConvolutionLayer(
+ 3U, 3U, get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_w_0.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), dwc_info)
+ .set_name(unit_name + "/gconv3_" + unit_id_name + "/depthwise")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_rm_0.npy"),
+ get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_riv_0.npy"),
+ get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_s_0.npy"),
+ get_weights_accessor(data_path, "gconv3_" + unit_id_name + "_bn_b_0.npy"), 1e-5f)
+ .set_name(unit_name + "/gconv3_" + unit_id_name + "/BatchNorm")
+ << ConvolutionLayer(
+ 1U, 1U, depth,
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_w_0.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0), num_groups)
+ .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/convolution")
+ << BatchNormalizationLayer(get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_rm_0.npy"),
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_riv_0.npy"),
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_s_0.npy"),
+ get_weights_accessor(data_path, "gconv1_" + gconv_id_1_name + "_bn_b_0.npy"),
+ 1e-5f)
+ .set_name(unit_name + "/gconv1_" + gconv_id_1_name + "/BatchNorm");
- if(stride == 2)
+ if (stride == 2)
{
graph << ConcatLayer(std::move(left_ss), std::move(right_ss)).set_name(unit_name + "/Concat");
}
else
{
- graph << EltwiseLayer(std::move(left_ss), std::move(right_ss), EltwiseOperation::Add).set_name(unit_name + "/Add");
+ graph << EltwiseLayer(std::move(left_ss), std::move(right_ss), EltwiseOperation::Add)
+ .set_name(unit_name + "/Add");
}
- graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(unit_name + "/Relu");
+ graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(unit_name + "/Relu");
}
};
diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp
index 3ea2fea38f..7d0528f805 100644
--- a/examples/graph_squeezenet.cpp
+++ b/examples/graph_squeezenet.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphSqueezenetExample : public Example
{
public:
- GraphSqueezenetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1")
+ GraphSqueezenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,7 +49,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -62,104 +62,128 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 122.68f, 116.67f, 104.01f } };
+ const std::array<float, 3> mean_rgb{{122.68f, 116.67f, 104.01f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
<< ConvolutionLayer(
- 7U, 7U, 96U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("conv1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv1")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool1")
+ 7U, 7U, 96U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"),
+ PadStrideInfo(2, 2, 0, 0))
+ .set_name("conv1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("relu_conv1")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool1")
<< ConvolutionLayer(
- 1U, 1U, 16U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire2/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire2/relu_squeeze1x1");
+ 1U, 1U, 16U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire2_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire2/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire2/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire2", weights_layout, 64U, 64U).set_name("fire2/concat");
graph << ConvolutionLayer(
- 1U, 1U, 16U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire3/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire3/relu_squeeze1x1");
+ 1U, 1U, 16U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire3_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire3/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire3/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire3", weights_layout, 64U, 64U).set_name("fire3/concat");
graph << ConvolutionLayer(
- 1U, 1U, 32U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire4/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire4/relu_squeeze1x1");
+ 1U, 1U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire4_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire4/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire4/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire4", weights_layout, 128U, 128U).set_name("fire4/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool4")
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool4")
<< ConvolutionLayer(
- 1U, 1U, 32U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire5/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire5/relu_squeeze1x1");
+ 1U, 1U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire5_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire5/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire5/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire5", weights_layout, 128U, 128U).set_name("fire5/concat");
graph << ConvolutionLayer(
- 1U, 1U, 48U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire6/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire6/relu_squeeze1x1");
+ 1U, 1U, 48U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire6_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire6/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire6/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire6", weights_layout, 192U, 192U).set_name("fire6/concat");
graph << ConvolutionLayer(
- 1U, 1U, 48U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire7/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire7/relu_squeeze1x1");
+ 1U, 1U, 48U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire7_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire7/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire7/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire7", weights_layout, 192U, 192U).set_name("fire7/concat");
graph << ConvolutionLayer(
- 1U, 1U, 64U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire8/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire8/relu_squeeze1x1");
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire8_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire8/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire8/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire8", weights_layout, 256U, 256U).set_name("fire8/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool8")
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool8")
<< ConvolutionLayer(
- 1U, 1U, 64U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire9/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire9/relu_squeeze1x1");
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/fire9_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire9/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire9/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire9", weights_layout, 256U, 256U).set_name("fire9/concat");
graph << ConvolutionLayer(
- 1U, 1U, 1000U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv10")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv10")
+ 1U, 1U, 1000U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv10_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv10")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("relu_conv10")
<< PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool10")
- << FlattenLayer().set_name("flatten")
- << SoftmaxLayer().set_name("prob")
+ << FlattenLayer().set_name("flatten") << SoftmaxLayer().set_name("prob")
<< OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
@@ -188,27 +212,30 @@ private:
CommonGraphParams common_params;
Stream graph;
- ConcatLayer get_expand_fire_node(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int expand1_filt, unsigned int expand3_filt)
+ ConcatLayer get_expand_fire_node(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int expand1_filt,
+ unsigned int expand3_filt)
{
std::string total_path = "/cnn_data/squeezenet_v1.0_model/" + param_path + "_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(
- 1U, 1U, expand1_filt,
- get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "expand1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/expand1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand1x1");
+ i_a << ConvolutionLayer(1U, 1U, expand1_filt,
+ get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "expand1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/expand1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_expand1x1");
SubStream i_b(graph);
- i_b << ConvolutionLayer(
- 3U, 3U, expand3_filt,
- get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "expand3x3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/expand3x3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand3x3");
+ i_b << ConvolutionLayer(3U, 3U, expand3_filt,
+ get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "expand3x3_b.npy"),
+ PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/expand3x3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_expand3x3");
return ConcatLayer(std::move(i_a), std::move(i_b));
}
diff --git a/examples/graph_squeezenet_v1_1.cpp b/examples/graph_squeezenet_v1_1.cpp
index 9cc183fbbd..ed0f692db2 100644
--- a/examples/graph_squeezenet_v1_1.cpp
+++ b/examples/graph_squeezenet_v1_1.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphSqueezenet_v1_1Example : public Example
{
public:
- GraphSqueezenet_v1_1Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1.1")
+ GraphSqueezenet_v1_1Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "SqueezeNetV1.1")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,7 +49,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -62,104 +62,128 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 122.68f, 116.67f, 104.01f } };
+ const std::array<float, 3> mean_rgb{{122.68f, 116.67f, 104.01f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(227U, 227U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
<< ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_b.npy"),
- PadStrideInfo(2, 2, 0, 0))
- .set_name("conv1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv1")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool1")
+ 3U, 3U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_b.npy"),
+ PadStrideInfo(2, 2, 0, 0))
+ .set_name("conv1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("relu_conv1")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool1")
<< ConvolutionLayer(
- 1U, 1U, 16U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire2/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire2/relu_squeeze1x1");
+ 1U, 1U, 16U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire2/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire2/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire2", weights_layout, 64U, 64U).set_name("fire2/concat");
graph << ConvolutionLayer(
- 1U, 1U, 16U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire3/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire3/relu_squeeze1x1");
+ 1U, 1U, 16U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire3_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire3/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire3/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire3", weights_layout, 64U, 64U).set_name("fire3/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool3")
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool3")
<< ConvolutionLayer(
- 1U, 1U, 32U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire4/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire4/relu_squeeze1x1");
+ 1U, 1U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire4_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire4/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire4/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire4", weights_layout, 128U, 128U).set_name("fire4/concat");
graph << ConvolutionLayer(
- 1U, 1U, 32U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire5/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire5/relu_squeeze1x1");
+ 1U, 1U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire5_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire5/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire5/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire5", weights_layout, 128U, 128U).set_name("fire5/concat");
- graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))).set_name("pool5")
+ graph << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, operation_layout,
+ PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
+ .set_name("pool5")
<< ConvolutionLayer(
- 1U, 1U, 48U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire6/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire6/relu_squeeze1x1");
+ 1U, 1U, 48U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire6_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire6/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire6/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire6", weights_layout, 192U, 192U).set_name("fire6/concat");
graph << ConvolutionLayer(
- 1U, 1U, 48U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire7/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire7/relu_squeeze1x1");
+ 1U, 1U, 48U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire7_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire7/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire7/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire7", weights_layout, 192U, 192U).set_name("fire7/concat");
graph << ConvolutionLayer(
- 1U, 1U, 64U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire8/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire8/relu_squeeze1x1");
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire8_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire8/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire8/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire8", weights_layout, 256U, 256U).set_name("fire8/concat");
graph << ConvolutionLayer(
- 1U, 1U, 64U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("fire9/squeeze1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("fire9/relu_squeeze1x1");
+ 1U, 1U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_w.npy",
+ weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire9_squeeze1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("fire9/squeeze1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("fire9/relu_squeeze1x1");
graph << get_expand_fire_node(data_path, "fire9", weights_layout, 256U, 256U).set_name("fire9/concat");
graph << ConvolutionLayer(
- 1U, 1U, 1000U,
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv10")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("relu_conv10")
+ 1U, 1U, 1000U,
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv10_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv10")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("relu_conv10")
<< PoolingLayer(PoolingLayerInfo(PoolingType::AVG, operation_layout)).set_name("pool10")
- << FlattenLayer().set_name("flatten")
- << SoftmaxLayer().set_name("prob")
+ << FlattenLayer().set_name("flatten") << SoftmaxLayer().set_name("prob")
<< OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
@@ -188,27 +212,30 @@ private:
CommonGraphParams common_params;
Stream graph;
- ConcatLayer get_expand_fire_node(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int expand1_filt, unsigned int expand3_filt)
+ ConcatLayer get_expand_fire_node(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int expand1_filt,
+ unsigned int expand3_filt)
{
std::string total_path = "/cnn_data/squeezenet_v1_1_model/" + param_path + "_";
SubStream i_a(graph);
- i_a << ConvolutionLayer(
- 1U, 1U, expand1_filt,
- get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "expand1x1_b.npy"),
- PadStrideInfo(1, 1, 0, 0))
- .set_name(param_path + "/expand1x1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand1x1");
+ i_a << ConvolutionLayer(1U, 1U, expand1_filt,
+ get_weights_accessor(data_path, total_path + "expand1x1_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "expand1x1_b.npy"),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name(param_path + "/expand1x1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_expand1x1");
SubStream i_b(graph);
- i_b << ConvolutionLayer(
- 3U, 3U, expand3_filt,
- get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout),
- get_weights_accessor(data_path, total_path + "expand3x3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name(param_path + "/expand3x3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "/relu_expand3x3");
+ i_b << ConvolutionLayer(3U, 3U, expand3_filt,
+ get_weights_accessor(data_path, total_path + "expand3x3_w.npy", weights_layout),
+ get_weights_accessor(data_path, total_path + "expand3x3_b.npy"),
+ PadStrideInfo(1, 1, 1, 1))
+ .set_name(param_path + "/expand3x3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "/relu_expand3x3");
return ConcatLayer(std::move(i_a), std::move(i_b));
}
diff --git a/examples/graph_srcnn955.cpp b/examples/graph_srcnn955.cpp
index 855bbd848e..15a8b5d8ec 100644
--- a/examples/graph_srcnn955.cpp
+++ b/examples/graph_srcnn955.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -36,7 +37,12 @@ class GraphSRCNN955Example : public Example
{
public:
GraphSRCNN955Example()
- : cmd_parser(), common_opts(cmd_parser), model_input_width(nullptr), model_input_height(nullptr), common_params(), graph(0, "SRCNN955")
+ : cmd_parser(),
+ common_opts(cmd_parser),
+ model_input_width(nullptr),
+ model_input_height(nullptr),
+ common_params(),
+ graph(0, "SRCNN955")
{
model_input_width = cmd_parser.add_option<SimpleOption<unsigned int>>("image-width", 300);
model_input_height = cmd_parser.add_option<SimpleOption<unsigned int>>("image-height", 300);
@@ -45,7 +51,7 @@ public:
model_input_width->set_help("Input image width.");
model_input_height->set_help("Input image height.");
}
- GraphSRCNN955Example(const GraphSRCNN955Example &) = delete;
+ GraphSRCNN955Example(const GraphSRCNN955Example &) = delete;
GraphSRCNN955Example &operator=(const GraphSRCNN955Example &) = delete;
~GraphSRCNN955Example() override = default;
bool do_setup(int argc, char **argv) override
@@ -58,7 +64,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -81,36 +87,33 @@ public:
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<TFPreproccessor>();
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(image_width, image_height, 3U, common_params.batches), DataLayout::NCHW,
+ common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false /* Do not convert to BGR */))
- << ConvolutionLayer(
- 9U, 9U, 64U,
- get_weights_accessor(data_path, "conv1_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv1_biases.npy"),
- PadStrideInfo(1, 1, 4, 4))
- .set_name("conv1/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1/Relu")
- << ConvolutionLayer(
- 5U, 5U, 32U,
- get_weights_accessor(data_path, "conv2_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv2_biases.npy"),
- PadStrideInfo(1, 1, 2, 2))
- .set_name("conv2/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2/Relu")
- << ConvolutionLayer(
- 5U, 5U, 3U,
- get_weights_accessor(data_path, "conv3_weights.npy", weights_layout),
- get_weights_accessor(data_path, "conv3_biases.npy"),
- PadStrideInfo(1, 1, 2, 2))
- .set_name("conv3/convolution")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3/Relu")
+ graph << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor),
+ false /* Do not convert to BGR */))
+ << ConvolutionLayer(9U, 9U, 64U, get_weights_accessor(data_path, "conv1_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv1_biases.npy"), PadStrideInfo(1, 1, 4, 4))
+ .set_name("conv1/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1/Relu")
+ << ConvolutionLayer(5U, 5U, 32U, get_weights_accessor(data_path, "conv2_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv2_biases.npy"), PadStrideInfo(1, 1, 2, 2))
+ .set_name("conv2/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2/Relu")
+ << ConvolutionLayer(5U, 5U, 3U, get_weights_accessor(data_path, "conv3_weights.npy", weights_layout),
+ get_weights_accessor(data_path, "conv3_biases.npy"), PadStrideInfo(1, 1, 2, 2))
+ .set_name("conv3/convolution")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3/Relu")
<< OutputLayer(std::make_unique<DummyAccessor>(0));
// Finalize graph
@@ -137,8 +140,8 @@ public:
private:
CommandLineParser cmd_parser;
CommonGraphOptions common_opts;
- SimpleOption<unsigned int> *model_input_width{ nullptr };
- SimpleOption<unsigned int> *model_input_height{ nullptr };
+ SimpleOption<unsigned int> *model_input_width{nullptr};
+ SimpleOption<unsigned int> *model_input_height{nullptr};
CommonGraphParams common_params;
Stream graph;
};
diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp
index 9fe7f5b454..5162fe6890 100644
--- a/examples/graph_ssd_mobilenet.cpp
+++ b/examples/graph_ssd_mobilenet.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -36,23 +37,26 @@ using namespace arm_compute::graph_utils;
class GraphSSDMobilenetExample : public Example
{
public:
- GraphSSDMobilenetExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetSSD")
+ GraphSSDMobilenetExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetSSD")
{
// Add topk option
keep_topk_opt = cmd_parser.add_option<SimpleOption<int>>("topk", 100);
keep_topk_opt->set_help("Top k detections results per image. Used for data type F32.");
// Add output option
detection_boxes_opt = cmd_parser.add_option<SimpleOption<std::string>>("detection_boxes_opt", "");
- detection_boxes_opt->set_help("Filename containing the reference values for the graph output detection_boxes. Used for data type QASYMM8.");
+ detection_boxes_opt->set_help("Filename containing the reference values for the graph output detection_boxes. "
+ "Used for data type QASYMM8.");
detection_classes_opt = cmd_parser.add_option<SimpleOption<std::string>>("detection_classes_opt", "");
- detection_classes_opt->set_help("Filename containing the reference values for the output detection_classes. Used for data type QASYMM8.");
+ detection_classes_opt->set_help(
+ "Filename containing the reference values for the output detection_classes. Used for data type QASYMM8.");
detection_scores_opt = cmd_parser.add_option<SimpleOption<std::string>>("detection_scores_opt", "");
- detection_scores_opt->set_help("Filename containing the reference values for the output detection_scores. Used for data type QASYMM8.");
+ detection_scores_opt->set_help(
+ "Filename containing the reference values for the output detection_scores. Used for data type QASYMM8.");
num_detections_opt = cmd_parser.add_option<SimpleOption<std::string>>("num_detections_opt", "");
- num_detections_opt->set_help("Filename containing the reference values for the output num_detections. Used with datatype QASYMM8.");
+ num_detections_opt->set_help(
+ "Filename containing the reference values for the output num_detections. Used with datatype QASYMM8.");
}
- GraphSSDMobilenetExample(const GraphSSDMobilenetExample &) = delete;
+ GraphSSDMobilenetExample(const GraphSSDMobilenetExample &) = delete;
GraphSSDMobilenetExample &operator=(const GraphSSDMobilenetExample &) = delete;
~GraphSSDMobilenetExample() override = default;
bool do_setup(int argc, char **argv) override
@@ -65,7 +69,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -75,15 +79,16 @@ public:
std::cout << common_params << std::endl;
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(300, 300, 3U, 1U), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(300, 300, 3U, 1U), DataLayout::NCHW, common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set graph hints
- graph << common_params.target
- << common_params.fast_math_hint;
+ graph << common_params.target << common_params.fast_math_hint;
// Create core graph
- if(arm_compute::is_data_type_float(common_params.data_type))
+ if (arm_compute::is_data_type_float(common_params.data_type))
{
create_graph_float(input_descriptor);
}
@@ -112,99 +117,98 @@ public:
private:
CommandLineParser cmd_parser;
CommonGraphOptions common_opts;
- SimpleOption<int> *keep_topk_opt{ nullptr };
+ SimpleOption<int> *keep_topk_opt{nullptr};
CommonGraphParams common_params;
Stream graph;
- SimpleOption<std::string> *detection_boxes_opt{ nullptr };
- SimpleOption<std::string> *detection_classes_opt{ nullptr };
- SimpleOption<std::string> *detection_scores_opt{ nullptr };
- SimpleOption<std::string> *num_detections_opt{ nullptr };
-
- ConcatLayer get_node_A_float(IStream &main_graph, const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt,
- PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info)
+ SimpleOption<std::string> *detection_boxes_opt{nullptr};
+ SimpleOption<std::string> *detection_classes_opt{nullptr};
+ SimpleOption<std::string> *detection_scores_opt{nullptr};
+ SimpleOption<std::string> *num_detections_opt{nullptr};
+
+ ConcatLayer get_node_A_float(IStream &main_graph,
+ const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo dwc_pad_stride_info,
+ PadStrideInfo conv_pad_stride_info)
{
const std::string total_path = param_path + "_";
SubStream sg(main_graph);
- sg << DepthwiseConvolutionLayer(
- 3U, 3U,
- get_weights_accessor(data_path, total_path + "dw_w.npy"),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- dwc_pad_stride_info)
- .set_name(param_path + "/dw")
+ sg << DepthwiseConvolutionLayer(3U, 3U, get_weights_accessor(data_path, total_path + "dw_w.npy"),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ dwc_pad_stride_info)
+ .set_name(param_path + "/dw")
<< BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "dw_bn_mean.npy"),
get_weights_accessor(data_path, total_path + "dw_bn_var.npy"),
get_weights_accessor(data_path, total_path + "dw_scale_w.npy"),
get_weights_accessor(data_path, total_path + "dw_scale_b.npy"), 0.00001f)
- .set_name(param_path + "/dw/bn")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "dw/relu")
-
- << ConvolutionLayer(
- 1U, 1U, conv_filt,
- get_weights_accessor(data_path, total_path + "w.npy"),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- conv_pad_stride_info)
- .set_name(param_path + "/pw")
+ .set_name(param_path + "/dw/bn")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "dw/relu")
+
+ << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), conv_pad_stride_info)
+ .set_name(param_path + "/pw")
<< BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "bn_mean.npy"),
get_weights_accessor(data_path, total_path + "bn_var.npy"),
get_weights_accessor(data_path, total_path + "scale_w.npy"),
get_weights_accessor(data_path, total_path + "scale_b.npy"), 0.00001f)
- .set_name(param_path + "/pw/bn")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(param_path + "pw/relu");
+ .set_name(param_path + "/pw/bn")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(param_path + "pw/relu");
return ConcatLayer(std::move(sg));
}
- ConcatLayer get_node_B_float(IStream &main_graph, const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt,
- PadStrideInfo conv_pad_stride_info_1, PadStrideInfo conv_pad_stride_info_2)
+ ConcatLayer get_node_B_float(IStream &main_graph,
+ const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo conv_pad_stride_info_1,
+ PadStrideInfo conv_pad_stride_info_2)
{
const std::string total_path = param_path + "_";
SubStream sg(main_graph);
- sg << ConvolutionLayer(
- 1, 1, conv_filt / 2,
- get_weights_accessor(data_path, total_path + "1_w.npy"),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- conv_pad_stride_info_1)
- .set_name(total_path + "1/conv")
+ sg << ConvolutionLayer(1, 1, conv_filt / 2, get_weights_accessor(data_path, total_path + "1_w.npy"),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), conv_pad_stride_info_1)
+ .set_name(total_path + "1/conv")
<< BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "1_bn_mean.npy"),
get_weights_accessor(data_path, total_path + "1_bn_var.npy"),
get_weights_accessor(data_path, total_path + "1_scale_w.npy"),
get_weights_accessor(data_path, total_path + "1_scale_b.npy"), 0.00001f)
- .set_name(total_path + "1/bn")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(total_path + "1/relu");
-
- sg << ConvolutionLayer(
- 3, 3, conv_filt,
- get_weights_accessor(data_path, total_path + "2_w.npy"),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- conv_pad_stride_info_2)
- .set_name(total_path + "2/conv")
+ .set_name(total_path + "1/bn")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(total_path + "1/relu");
+
+ sg << ConvolutionLayer(3, 3, conv_filt, get_weights_accessor(data_path, total_path + "2_w.npy"),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), conv_pad_stride_info_2)
+ .set_name(total_path + "2/conv")
<< BatchNormalizationLayer(get_weights_accessor(data_path, total_path + "2_bn_mean.npy"),
get_weights_accessor(data_path, total_path + "2_bn_var.npy"),
get_weights_accessor(data_path, total_path + "2_scale_w.npy"),
get_weights_accessor(data_path, total_path + "2_scale_b.npy"), 0.00001f)
- .set_name(total_path + "2/bn")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(total_path + "2/relu");
+ .set_name(total_path + "2/bn")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(total_path + "2/relu");
return ConcatLayer(std::move(sg));
}
- ConcatLayer get_node_C_float(IStream &main_graph, const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt, PadStrideInfo conv_pad_stride_info)
+ ConcatLayer get_node_C_float(IStream &main_graph,
+ const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo conv_pad_stride_info)
{
const std::string total_path = param_path + "_";
SubStream sg(main_graph);
- sg << ConvolutionLayer(
- 1U, 1U, conv_filt,
- get_weights_accessor(data_path, total_path + "w.npy"),
- get_weights_accessor(data_path, total_path + "b.npy"),
- conv_pad_stride_info)
- .set_name(param_path + "/conv");
- if(common_params.data_layout == DataLayout::NCHW)
+ sg << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"),
+ get_weights_accessor(data_path, total_path + "b.npy"), conv_pad_stride_info)
+ .set_name(param_path + "/conv");
+ if (common_params.data_layout == DataLayout::NCHW)
{
sg << PermuteLayer(PermutationVector(2U, 0U, 1U), DataLayout::NHWC).set_name(param_path + "/perm");
}
@@ -216,62 +220,77 @@ private:
void create_graph_float(TensorDescriptor &input_descriptor)
{
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 127.5f, 127.5f, 127.5f } };
+ const std::array<float, 3> mean_rgb{{127.5f, 127.5f, 127.5f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb, true, 0.007843f);
// Get trainable parameters data path
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += "/cnn_data/ssd_mobilenet_model/";
}
- graph << InputLayer(input_descriptor,
- get_input_accessor(common_params, std::move(preprocessor)));
+ graph << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)));
SubStream conv_11(graph);
- conv_11 << ConvolutionLayer(
- 3U, 3U, 32U,
- get_weights_accessor(data_path, "conv0_w.npy"),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("conv0");
+ conv_11 << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "conv0_w.npy"),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+ PadStrideInfo(2, 2, 1, 1))
+ .set_name("conv0");
conv_11 << BatchNormalizationLayer(get_weights_accessor(data_path, "conv0_bn_mean.npy"),
get_weights_accessor(data_path, "conv0_bn_var.npy"),
get_weights_accessor(data_path, "conv0_scale_w.npy"),
get_weights_accessor(data_path, "conv0_scale_b.npy"), 0.00001f)
- .set_name("conv0/bn")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/relu");
-
- conv_11 << get_node_A_float(conv_11, data_path, "conv1", 64, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv2", 128, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv3", 128, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv4", 256, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv5", 256, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv6", 512, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv7", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv8", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv9", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv10", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_11 << get_node_A_float(conv_11, data_path, "conv11", 512, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
+ .set_name("conv0/bn")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv0/relu");
+
+ conv_11 << get_node_A_float(conv_11, data_path, "conv1", 64, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv2", 128, PadStrideInfo(2, 2, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv3", 128, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv4", 256, PadStrideInfo(2, 2, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv5", 256, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv6", 512, PadStrideInfo(2, 2, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv7", 512, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv8", 512, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv9", 512, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv10", 512, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_11 << get_node_A_float(conv_11, data_path, "conv11", 512, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
SubStream conv_13(conv_11);
- conv_13 << get_node_A_float(conv_11, data_path, "conv12", 1024, PadStrideInfo(2, 2, 1, 1), PadStrideInfo(1, 1, 0, 0));
- conv_13 << get_node_A_float(conv_13, data_path, "conv13", 1024, PadStrideInfo(1, 1, 1, 1), PadStrideInfo(1, 1, 0, 0));
+ conv_13 << get_node_A_float(conv_11, data_path, "conv12", 1024, PadStrideInfo(2, 2, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
+ conv_13 << get_node_A_float(conv_13, data_path, "conv13", 1024, PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 0, 0));
SubStream conv_14(conv_13);
- conv_14 << get_node_B_float(conv_13, data_path, "conv14", 512, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1));
+ conv_14 << get_node_B_float(conv_13, data_path, "conv14", 512, PadStrideInfo(1, 1, 0, 0),
+ PadStrideInfo(2, 2, 1, 1));
SubStream conv_15(conv_14);
- conv_15 << get_node_B_float(conv_14, data_path, "conv15", 256, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1));
+ conv_15 << get_node_B_float(conv_14, data_path, "conv15", 256, PadStrideInfo(1, 1, 0, 0),
+ PadStrideInfo(2, 2, 1, 1));
SubStream conv_16(conv_15);
- conv_16 << get_node_B_float(conv_15, data_path, "conv16", 256, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1));
+ conv_16 << get_node_B_float(conv_15, data_path, "conv16", 256, PadStrideInfo(1, 1, 0, 0),
+ PadStrideInfo(2, 2, 1, 1));
SubStream conv_17(conv_16);
- conv_17 << get_node_B_float(conv_16, data_path, "conv17", 128, PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 1, 1));
+ conv_17 << get_node_B_float(conv_16, data_path, "conv17", 128, PadStrideInfo(1, 1, 0, 0),
+ PadStrideInfo(2, 2, 1, 1));
//mbox_loc
SubStream conv_11_mbox_loc(conv_11);
@@ -293,8 +312,9 @@ private:
conv_17_2_mbox_loc << get_node_C_float(conv_17, data_path, "conv17_2_mbox_loc", 24, PadStrideInfo(1, 1, 0, 0));
SubStream mbox_loc(graph);
- mbox_loc << ConcatLayer(std::move(conv_11_mbox_loc), std::move(conv_13_mbox_loc), conv_14_2_mbox_loc, std::move(conv_15_2_mbox_loc),
- std::move(conv_16_2_mbox_loc), std::move(conv_17_2_mbox_loc));
+ mbox_loc << ConcatLayer(std::move(conv_11_mbox_loc), std::move(conv_13_mbox_loc), conv_14_2_mbox_loc,
+ std::move(conv_15_2_mbox_loc), std::move(conv_16_2_mbox_loc),
+ std::move(conv_17_2_mbox_loc));
//mbox_conf
SubStream conv_11_mbox_conf(conv_11);
@@ -304,67 +324,79 @@ private:
conv_13_mbox_conf << get_node_C_float(conv_13, data_path, "conv13_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0));
SubStream conv_14_2_mbox_conf(conv_14);
- conv_14_2_mbox_conf << get_node_C_float(conv_14, data_path, "conv14_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0));
+ conv_14_2_mbox_conf << get_node_C_float(conv_14, data_path, "conv14_2_mbox_conf", 126,
+ PadStrideInfo(1, 1, 0, 0));
SubStream conv_15_2_mbox_conf(conv_15);
- conv_15_2_mbox_conf << get_node_C_float(conv_15, data_path, "conv15_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0));
+ conv_15_2_mbox_conf << get_node_C_float(conv_15, data_path, "conv15_2_mbox_conf", 126,
+ PadStrideInfo(1, 1, 0, 0));
SubStream conv_16_2_mbox_conf(conv_16);
- conv_16_2_mbox_conf << get_node_C_float(conv_16, data_path, "conv16_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0));
+ conv_16_2_mbox_conf << get_node_C_float(conv_16, data_path, "conv16_2_mbox_conf", 126,
+ PadStrideInfo(1, 1, 0, 0));
SubStream conv_17_2_mbox_conf(conv_17);
- conv_17_2_mbox_conf << get_node_C_float(conv_17, data_path, "conv17_2_mbox_conf", 126, PadStrideInfo(1, 1, 0, 0));
+ conv_17_2_mbox_conf << get_node_C_float(conv_17, data_path, "conv17_2_mbox_conf", 126,
+ PadStrideInfo(1, 1, 0, 0));
SubStream mbox_conf(graph);
- mbox_conf << ConcatLayer(std::move(conv_11_mbox_conf), std::move(conv_13_mbox_conf), std::move(conv_14_2_mbox_conf),
- std::move(conv_15_2_mbox_conf), std::move(conv_16_2_mbox_conf), std::move(conv_17_2_mbox_conf));
+ mbox_conf << ConcatLayer(std::move(conv_11_mbox_conf), std::move(conv_13_mbox_conf),
+ std::move(conv_14_2_mbox_conf), std::move(conv_15_2_mbox_conf),
+ std::move(conv_16_2_mbox_conf), std::move(conv_17_2_mbox_conf));
mbox_conf << ReshapeLayer(TensorShape(21U, 1917U)).set_name("mbox_conf/reshape");
mbox_conf << SoftmaxLayer().set_name("mbox_conf/softmax");
mbox_conf << FlattenLayer().set_name("mbox_conf/flat");
- const std::vector<float> priorbox_variances = { 0.1f, 0.1f, 0.2f, 0.2f };
+ const std::vector<float> priorbox_variances = {0.1f, 0.1f, 0.2f, 0.2f};
const float priorbox_offset = 0.5f;
- const std::vector<float> priorbox_aspect_ratios = { 2.f, 3.f };
+ const std::vector<float> priorbox_aspect_ratios = {2.f, 3.f};
//mbox_priorbox branch
SubStream conv_11_mbox_priorbox(conv_11);
conv_11_mbox_priorbox << PriorBoxLayer(SubStream(graph),
- PriorBoxLayerInfo({ 60.f }, priorbox_variances, priorbox_offset, true, false, {}, { 2.f }))
- .set_name("conv11/priorbox");
+ PriorBoxLayerInfo({60.f}, priorbox_variances, priorbox_offset, true,
+ false, {}, {2.f}))
+ .set_name("conv11/priorbox");
SubStream conv_13_mbox_priorbox(conv_13);
conv_13_mbox_priorbox << PriorBoxLayer(SubStream(graph),
- PriorBoxLayerInfo({ 105.f }, priorbox_variances, priorbox_offset, true, false, { 150.f }, priorbox_aspect_ratios))
- .set_name("conv13/priorbox");
+ PriorBoxLayerInfo({105.f}, priorbox_variances, priorbox_offset, true,
+ false, {150.f}, priorbox_aspect_ratios))
+ .set_name("conv13/priorbox");
SubStream conv_14_2_mbox_priorbox(conv_14);
conv_14_2_mbox_priorbox << PriorBoxLayer(SubStream(graph),
- PriorBoxLayerInfo({ 150.f }, priorbox_variances, priorbox_offset, true, false, { 195.f }, priorbox_aspect_ratios))
- .set_name("conv14/priorbox");
+ PriorBoxLayerInfo({150.f}, priorbox_variances, priorbox_offset, true,
+ false, {195.f}, priorbox_aspect_ratios))
+ .set_name("conv14/priorbox");
SubStream conv_15_2_mbox_priorbox(conv_15);
conv_15_2_mbox_priorbox << PriorBoxLayer(SubStream(graph),
- PriorBoxLayerInfo({ 195.f }, priorbox_variances, priorbox_offset, true, false, { 240.f }, priorbox_aspect_ratios))
- .set_name("conv15/priorbox");
+ PriorBoxLayerInfo({195.f}, priorbox_variances, priorbox_offset, true,
+ false, {240.f}, priorbox_aspect_ratios))
+ .set_name("conv15/priorbox");
SubStream conv_16_2_mbox_priorbox(conv_16);
conv_16_2_mbox_priorbox << PriorBoxLayer(SubStream(graph),
- PriorBoxLayerInfo({ 240.f }, priorbox_variances, priorbox_offset, true, false, { 285.f }, priorbox_aspect_ratios))
- .set_name("conv16/priorbox");
+ PriorBoxLayerInfo({240.f}, priorbox_variances, priorbox_offset, true,
+ false, {285.f}, priorbox_aspect_ratios))
+ .set_name("conv16/priorbox");
SubStream conv_17_2_mbox_priorbox(conv_17);
conv_17_2_mbox_priorbox << PriorBoxLayer(SubStream(graph),
- PriorBoxLayerInfo({ 285.f }, priorbox_variances, priorbox_offset, true, false, { 300.f }, priorbox_aspect_ratios))
- .set_name("conv17/priorbox");
+ PriorBoxLayerInfo({285.f}, priorbox_variances, priorbox_offset, true,
+ false, {300.f}, priorbox_aspect_ratios))
+ .set_name("conv17/priorbox");
SubStream mbox_priorbox(graph);
mbox_priorbox << ConcatLayer(
- (common_params.data_layout == DataLayout::NCHW) ? arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH) : arm_compute::graph::descriptors::ConcatLayerDescriptor(
- DataLayoutDimension::CHANNEL),
- std::move(conv_11_mbox_priorbox), std::move(conv_13_mbox_priorbox), std::move(conv_14_2_mbox_priorbox),
- std::move(conv_15_2_mbox_priorbox), std::move(conv_16_2_mbox_priorbox), std::move(conv_17_2_mbox_priorbox));
+ (common_params.data_layout == DataLayout::NCHW)
+ ? arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH)
+ : arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL),
+ std::move(conv_11_mbox_priorbox), std::move(conv_13_mbox_priorbox), std::move(conv_14_2_mbox_priorbox),
+ std::move(conv_15_2_mbox_priorbox), std::move(conv_16_2_mbox_priorbox), std::move(conv_17_2_mbox_priorbox));
const int num_classes = 21;
const bool share_location = true;
@@ -377,77 +409,85 @@ private:
SubStream detection_ouput(mbox_loc);
detection_ouput << DetectionOutputLayer(std::move(mbox_conf), std::move(mbox_priorbox),
- DetectionOutputLayerInfo(num_classes, share_location, detection_type, keep_top_k, nms_threshold, top_k, label_id_background, conf_thrs));
- detection_ouput << OutputLayer(get_detection_output_accessor(common_params, { input_descriptor.shape }));
+ DetectionOutputLayerInfo(num_classes, share_location, detection_type,
+ keep_top_k, nms_threshold, top_k,
+ label_id_background, conf_thrs));
+ detection_ouput << OutputLayer(get_detection_output_accessor(common_params, {input_descriptor.shape}));
}
- ConcatLayer get_node_A_qasymm(IStream &main_graph, const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt,
- PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info,
- std::pair<QuantizationInfo, QuantizationInfo> depth_quant_info, std::pair<QuantizationInfo, QuantizationInfo> point_quant_info)
+ ConcatLayer get_node_A_qasymm(IStream &main_graph,
+ const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo dwc_pad_stride_info,
+ PadStrideInfo conv_pad_stride_info,
+ std::pair<QuantizationInfo, QuantizationInfo> depth_quant_info,
+ std::pair<QuantizationInfo, QuantizationInfo> point_quant_info)
{
const std::string total_path = param_path + "_";
SubStream sg(main_graph);
- sg << DepthwiseConvolutionLayer(
- 3U, 3U,
- get_weights_accessor(data_path, total_path + "dw_w.npy"),
- get_weights_accessor(data_path, total_path + "dw_b.npy"),
- dwc_pad_stride_info, 1, depth_quant_info.first, depth_quant_info.second)
- .set_name(param_path + "/dw")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(param_path + "/dw/relu6");
-
- sg << ConvolutionLayer(
- 1U, 1U, conv_filt,
- get_weights_accessor(data_path, total_path + "w.npy"),
- get_weights_accessor(data_path, total_path + "b.npy"),
- conv_pad_stride_info, 1, point_quant_info.first, point_quant_info.second)
- .set_name(param_path + "/pw")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(param_path + "/pw/relu6");
+ sg << DepthwiseConvolutionLayer(3U, 3U, get_weights_accessor(data_path, total_path + "dw_w.npy"),
+ get_weights_accessor(data_path, total_path + "dw_b.npy"), dwc_pad_stride_info,
+ 1, depth_quant_info.first, depth_quant_info.second)
+ .set_name(param_path + "/dw")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name(param_path + "/dw/relu6");
+
+ sg << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"),
+ get_weights_accessor(data_path, total_path + "b.npy"), conv_pad_stride_info, 1,
+ point_quant_info.first, point_quant_info.second)
+ .set_name(param_path + "/pw")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name(param_path + "/pw/relu6");
return ConcatLayer(std::move(sg));
}
- ConcatLayer get_node_B_qasymm(IStream &main_graph, const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt,
- PadStrideInfo conv_pad_stride_info_1x1, PadStrideInfo conv_pad_stride_info_3x3,
- const std::pair<QuantizationInfo, QuantizationInfo> quant_info_1x1, const std::pair<QuantizationInfo, QuantizationInfo> quant_info_3x3)
+ ConcatLayer get_node_B_qasymm(IStream &main_graph,
+ const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo conv_pad_stride_info_1x1,
+ PadStrideInfo conv_pad_stride_info_3x3,
+ const std::pair<QuantizationInfo, QuantizationInfo> quant_info_1x1,
+ const std::pair<QuantizationInfo, QuantizationInfo> quant_info_3x3)
{
const std::string total_path = param_path + "_";
SubStream sg(main_graph);
- sg << ConvolutionLayer(
- 1, 1, conv_filt / 2,
- get_weights_accessor(data_path, total_path + "1x1_w.npy"),
- get_weights_accessor(data_path, total_path + "1x1_b.npy"),
- conv_pad_stride_info_1x1, 1, quant_info_1x1.first, quant_info_1x1.second)
- .set_name(total_path + "1x1/conv")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "1x1/conv/relu6");
-
- sg << ConvolutionLayer(
- 3, 3, conv_filt,
- get_weights_accessor(data_path, total_path + "3x3_w.npy"),
- get_weights_accessor(data_path, total_path + "3x3_b.npy"),
- conv_pad_stride_info_3x3, 1, quant_info_3x3.first, quant_info_3x3.second)
- .set_name(total_path + "3x3/conv")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name(total_path + "3x3/conv/relu6");
+ sg << ConvolutionLayer(1, 1, conv_filt / 2, get_weights_accessor(data_path, total_path + "1x1_w.npy"),
+ get_weights_accessor(data_path, total_path + "1x1_b.npy"), conv_pad_stride_info_1x1, 1,
+ quant_info_1x1.first, quant_info_1x1.second)
+ .set_name(total_path + "1x1/conv")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name(total_path + "1x1/conv/relu6");
+
+ sg << ConvolutionLayer(3, 3, conv_filt, get_weights_accessor(data_path, total_path + "3x3_w.npy"),
+ get_weights_accessor(data_path, total_path + "3x3_b.npy"), conv_pad_stride_info_3x3, 1,
+ quant_info_3x3.first, quant_info_3x3.second)
+ .set_name(total_path + "3x3/conv")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name(total_path + "3x3/conv/relu6");
return ConcatLayer(std::move(sg));
}
- ConcatLayer get_node_C_qasymm(IStream &main_graph, const std::string &data_path, std::string &&param_path,
- unsigned int conv_filt, PadStrideInfo conv_pad_stride_info,
- const std::pair<QuantizationInfo, QuantizationInfo> quant_info, TensorShape reshape_shape)
+ ConcatLayer get_node_C_qasymm(IStream &main_graph,
+ const std::string &data_path,
+ std::string &&param_path,
+ unsigned int conv_filt,
+ PadStrideInfo conv_pad_stride_info,
+ const std::pair<QuantizationInfo, QuantizationInfo> quant_info,
+ TensorShape reshape_shape)
{
const std::string total_path = param_path + "_";
SubStream sg(main_graph);
- sg << ConvolutionLayer(
- 1U, 1U, conv_filt,
- get_weights_accessor(data_path, total_path + "w.npy"),
- get_weights_accessor(data_path, total_path + "b.npy"),
- conv_pad_stride_info, 1, quant_info.first, quant_info.second)
- .set_name(param_path + "/conv");
- if(common_params.data_layout == DataLayout::NCHW)
+ sg << ConvolutionLayer(1U, 1U, conv_filt, get_weights_accessor(data_path, total_path + "w.npy"),
+ get_weights_accessor(data_path, total_path + "b.npy"), conv_pad_stride_info, 1,
+ quant_info.first, quant_info.second)
+ .set_name(param_path + "/conv");
+ if (common_params.data_layout == DataLayout::NCHW)
{
sg << PermuteLayer(PermutationVector(2U, 0U, 1U), DataLayout::NHWC);
}
@@ -462,57 +502,59 @@ private:
std::string data_path = common_params.data_path;
// Add model path to data path
- if(!data_path.empty())
+ if (!data_path.empty())
{
data_path += "/cnn_data/ssd_mobilenet_qasymm8_model/";
}
// Quantization info are saved as pair for each (pointwise/depthwise) convolution layer: <weight_quant_info, output_quant_info>
- const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> conv_quant_info =
- {
- { QuantizationInfo(0.03624850884079933f, 163), QuantizationInfo(0.22219789028167725f, 113) }, // conv0
- { QuantizationInfo(0.0028752065263688564f, 113), QuantizationInfo(0.05433657020330429f, 128) }, // conv13_2_1_1
- { QuantizationInfo(0.0014862528769299388f, 125), QuantizationInfo(0.05037643015384674f, 131) }, // conv13_2_3_3
- { QuantizationInfo(0.00233650766313076f, 113), QuantizationInfo(0.04468846693634987f, 126) }, // conv13_3_1_1
- { QuantizationInfo(0.002501056529581547f, 120), QuantizationInfo(0.06026708707213402f, 111) }, // conv13_3_3_3
- { QuantizationInfo(0.002896666992455721f, 121), QuantizationInfo(0.037775348871946335f, 117) }, // conv13_4_1_1
- { QuantizationInfo(0.0023875406477600336f, 122), QuantizationInfo(0.03881589323282242f, 108) }, // conv13_4_3_3
- { QuantizationInfo(0.0022081052884459496f, 77), QuantizationInfo(0.025450613349676132f, 125) }, // conv13_5_1_1
- { QuantizationInfo(0.00604657270014286f, 121), QuantizationInfo(0.033533502370119095f, 109) } // conv13_5_3_3
+ const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> conv_quant_info = {
+ {QuantizationInfo(0.03624850884079933f, 163), QuantizationInfo(0.22219789028167725f, 113)}, // conv0
+ {QuantizationInfo(0.0028752065263688564f, 113),
+ QuantizationInfo(0.05433657020330429f, 128)}, // conv13_2_1_1
+ {QuantizationInfo(0.0014862528769299388f, 125),
+ QuantizationInfo(0.05037643015384674f, 131)}, // conv13_2_3_3
+ {QuantizationInfo(0.00233650766313076f, 113), QuantizationInfo(0.04468846693634987f, 126)}, // conv13_3_1_1
+ {QuantizationInfo(0.002501056529581547f, 120), QuantizationInfo(0.06026708707213402f, 111)}, // conv13_3_3_3
+ {QuantizationInfo(0.002896666992455721f, 121),
+ QuantizationInfo(0.037775348871946335f, 117)}, // conv13_4_1_1
+ {QuantizationInfo(0.0023875406477600336f, 122),
+ QuantizationInfo(0.03881589323282242f, 108)}, // conv13_4_3_3
+ {QuantizationInfo(0.0022081052884459496f, 77),
+ QuantizationInfo(0.025450613349676132f, 125)}, // conv13_5_1_1
+ {QuantizationInfo(0.00604657270014286f, 121), QuantizationInfo(0.033533502370119095f, 109)} // conv13_5_3_3
};
- const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> depth_quant_info =
- {
- { QuantizationInfo(0.03408717364072f, 131), QuantizationInfo(0.29286590218544006f, 108) }, // dwsc1
- { QuantizationInfo(0.027518004179000854f, 107), QuantizationInfo(0.20796941220760345, 117) }, // dwsc2
- { QuantizationInfo(0.052489638328552246f, 85), QuantizationInfo(0.4303881824016571f, 142) }, // dwsc3
- { QuantizationInfo(0.016570359468460083f, 79), QuantizationInfo(0.10512150079011917f, 116) }, // dwsc4
- { QuantizationInfo(0.060739465057849884f, 65), QuantizationInfo(0.15331414341926575f, 94) }, // dwsc5
- { QuantizationInfo(0.01324534136801958f, 124), QuantizationInfo(0.13010895252227783f, 153) }, // dwsc6
- { QuantizationInfo(0.032326459884643555f, 124), QuantizationInfo(0.11565316468477249, 156) }, // dwsc7
- { QuantizationInfo(0.029948478564620018f, 155), QuantizationInfo(0.11413891613483429f, 146) }, // dwsc8
- { QuantizationInfo(0.028054025024175644f, 129), QuantizationInfo(0.1142905130982399f, 140) }, // dwsc9
- { QuantizationInfo(0.025204822421073914f, 129), QuantizationInfo(0.14668069779872894f, 149) }, // dwsc10
- { QuantizationInfo(0.019332280382514f, 110), QuantizationInfo(0.1480235457420349f, 91) }, // dwsc11
- { QuantizationInfo(0.0319712869822979f, 88), QuantizationInfo(0.10424695909023285f, 117) }, // dwsc12
- { QuantizationInfo(0.04378943517804146f, 164), QuantizationInfo(0.23176774382591248f, 138) } // dwsc13
+ const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> depth_quant_info = {
+ {QuantizationInfo(0.03408717364072f, 131), QuantizationInfo(0.29286590218544006f, 108)}, // dwsc1
+ {QuantizationInfo(0.027518004179000854f, 107), QuantizationInfo(0.20796941220760345, 117)}, // dwsc2
+ {QuantizationInfo(0.052489638328552246f, 85), QuantizationInfo(0.4303881824016571f, 142)}, // dwsc3
+ {QuantizationInfo(0.016570359468460083f, 79), QuantizationInfo(0.10512150079011917f, 116)}, // dwsc4
+ {QuantizationInfo(0.060739465057849884f, 65), QuantizationInfo(0.15331414341926575f, 94)}, // dwsc5
+ {QuantizationInfo(0.01324534136801958f, 124), QuantizationInfo(0.13010895252227783f, 153)}, // dwsc6
+ {QuantizationInfo(0.032326459884643555f, 124), QuantizationInfo(0.11565316468477249, 156)}, // dwsc7
+ {QuantizationInfo(0.029948478564620018f, 155), QuantizationInfo(0.11413891613483429f, 146)}, // dwsc8
+ {QuantizationInfo(0.028054025024175644f, 129), QuantizationInfo(0.1142905130982399f, 140)}, // dwsc9
+ {QuantizationInfo(0.025204822421073914f, 129), QuantizationInfo(0.14668069779872894f, 149)}, // dwsc10
+ {QuantizationInfo(0.019332280382514f, 110), QuantizationInfo(0.1480235457420349f, 91)}, // dwsc11
+ {QuantizationInfo(0.0319712869822979f, 88), QuantizationInfo(0.10424695909023285f, 117)}, // dwsc12
+ {QuantizationInfo(0.04378943517804146f, 164), QuantizationInfo(0.23176774382591248f, 138)} // dwsc13
};
- const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> point_quant_info =
- {
- { QuantizationInfo(0.028777318075299263f, 144), QuantizationInfo(0.2663874328136444f, 121) }, // pw1
- { QuantizationInfo(0.015796702355146408f, 127), QuantizationInfo(0.1739964485168457f, 111) }, // pw2
- { QuantizationInfo(0.009349990636110306f, 127), QuantizationInfo(0.1805974692106247f, 104) }, // pw3
- { QuantizationInfo(0.012920888140797615f, 106), QuantizationInfo(0.1205204650759697f, 100) }, // pw4
- { QuantizationInfo(0.008119508624076843f, 145), QuantizationInfo(0.12272439152002335f, 97) }, // pw5
- { QuantizationInfo(0.0070041813887655735f, 115), QuantizationInfo(0.0947074219584465f, 101) }, // pw6
- { QuantizationInfo(0.004827278666198254f, 115), QuantizationInfo(0.0842885747551918f, 110) }, // pw7
- { QuantizationInfo(0.004755120258778334f, 128), QuantizationInfo(0.08283159881830215f, 116) }, // pw8
- { QuantizationInfo(0.007527193054556847f, 142), QuantizationInfo(0.12555131316184998f, 137) }, // pw9
- { QuantizationInfo(0.006050156895071268f, 109), QuantizationInfo(0.10871313512325287f, 124) }, // pw10
- { QuantizationInfo(0.00490700313821435f, 127), QuantizationInfo(0.10364262014627457f, 140) }, // pw11
- { QuantizationInfo(0.006063731852918863, 124), QuantizationInfo(0.11241862177848816f, 125) }, // pw12
- { QuantizationInfo(0.007901716977357864f, 139), QuantizationInfo(0.49889302253723145f, 141) } // pw13
+ const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> point_quant_info = {
+ {QuantizationInfo(0.028777318075299263f, 144), QuantizationInfo(0.2663874328136444f, 121)}, // pw1
+ {QuantizationInfo(0.015796702355146408f, 127), QuantizationInfo(0.1739964485168457f, 111)}, // pw2
+ {QuantizationInfo(0.009349990636110306f, 127), QuantizationInfo(0.1805974692106247f, 104)}, // pw3
+ {QuantizationInfo(0.012920888140797615f, 106), QuantizationInfo(0.1205204650759697f, 100)}, // pw4
+ {QuantizationInfo(0.008119508624076843f, 145), QuantizationInfo(0.12272439152002335f, 97)}, // pw5
+ {QuantizationInfo(0.0070041813887655735f, 115), QuantizationInfo(0.0947074219584465f, 101)}, // pw6
+ {QuantizationInfo(0.004827278666198254f, 115), QuantizationInfo(0.0842885747551918f, 110)}, // pw7
+ {QuantizationInfo(0.004755120258778334f, 128), QuantizationInfo(0.08283159881830215f, 116)}, // pw8
+ {QuantizationInfo(0.007527193054556847f, 142), QuantizationInfo(0.12555131316184998f, 137)}, // pw9
+ {QuantizationInfo(0.006050156895071268f, 109), QuantizationInfo(0.10871313512325287f, 124)}, // pw10
+ {QuantizationInfo(0.00490700313821435f, 127), QuantizationInfo(0.10364262014627457f, 140)}, // pw11
+ {QuantizationInfo(0.006063731852918863, 124), QuantizationInfo(0.11241862177848816f, 125)}, // pw12
+ {QuantizationInfo(0.007901716977357864f, 139), QuantizationInfo(0.49889302253723145f, 141)} // pw13
};
// Quantization info taken from the TfLite SSD MobileNet example
@@ -520,114 +562,154 @@ private:
// Create core graph
graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info),
get_weights_accessor(data_path, common_params.image, DataLayout::NHWC));
- graph << ConvolutionLayer(
- 3U, 3U, 32U,
- get_weights_accessor(data_path, "conv0_w.npy"),
- get_weights_accessor(data_path, "conv0_b.npy"),
- PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), 1, conv_quant_info.at(0).first, conv_quant_info.at(0).second)
- .set_name("conv0");
- graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f)).set_name("conv0/relu");
- graph << get_node_A_qasymm(graph, data_path, "conv1", 64U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(0),
- point_quant_info.at(0));
- graph << get_node_A_qasymm(graph, data_path, "conv2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(1),
- point_quant_info.at(1));
- graph << get_node_A_qasymm(graph, data_path, "conv3", 128U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(2),
- point_quant_info.at(2));
- graph << get_node_A_qasymm(graph, data_path, "conv4", 256U, PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(3),
- point_quant_info.at(3));
- graph << get_node_A_qasymm(graph, data_path, "conv5", 256U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(4),
- point_quant_info.at(4));
- graph << get_node_A_qasymm(graph, data_path, "conv6", 512U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(5),
- point_quant_info.at(5));
- graph << get_node_A_qasymm(graph, data_path, "conv7", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(6),
- point_quant_info.at(6));
- graph << get_node_A_qasymm(graph, data_path, "conv8", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(7),
- point_quant_info.at(7));
- graph << get_node_A_qasymm(graph, data_path, "conv9", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(8),
- point_quant_info.at(8));
- graph << get_node_A_qasymm(graph, data_path, "conv10", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(9),
- point_quant_info.at(9));
- graph << get_node_A_qasymm(graph, data_path, "conv11", 512U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(10),
- point_quant_info.at(10));
+ graph << ConvolutionLayer(3U, 3U, 32U, get_weights_accessor(data_path, "conv0_w.npy"),
+ get_weights_accessor(data_path, "conv0_b.npy"),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), 1,
+ conv_quant_info.at(0).first, conv_quant_info.at(0).second)
+ .set_name("conv0");
+ graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f))
+ .set_name("conv0/relu");
+ graph << get_node_A_qasymm(graph, data_path, "conv1", 64U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(0), point_quant_info.at(0));
+ graph << get_node_A_qasymm(graph, data_path, "conv2", 128U,
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(1), point_quant_info.at(1));
+ graph << get_node_A_qasymm(graph, data_path, "conv3", 128U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(2), point_quant_info.at(2));
+ graph << get_node_A_qasymm(graph, data_path, "conv4", 256U,
+ PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(3), point_quant_info.at(3));
+ graph << get_node_A_qasymm(graph, data_path, "conv5", 256U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(4), point_quant_info.at(4));
+ graph << get_node_A_qasymm(graph, data_path, "conv6", 512U,
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(5), point_quant_info.at(5));
+ graph << get_node_A_qasymm(graph, data_path, "conv7", 512U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(6), point_quant_info.at(6));
+ graph << get_node_A_qasymm(graph, data_path, "conv8", 512U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(7), point_quant_info.at(7));
+ graph << get_node_A_qasymm(graph, data_path, "conv9", 512U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(8), point_quant_info.at(8));
+ graph << get_node_A_qasymm(graph, data_path, "conv10", 512U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(9), point_quant_info.at(9));
+ graph << get_node_A_qasymm(graph, data_path, "conv11", 512U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(10), point_quant_info.at(10));
SubStream conv_13(graph);
- conv_13 << get_node_A_qasymm(graph, data_path, "conv12", 1024U, PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(11),
- point_quant_info.at(11));
- conv_13 << get_node_A_qasymm(conv_13, data_path, "conv13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(12),
- point_quant_info.at(12));
+ conv_13 << get_node_A_qasymm(graph, data_path, "conv12", 1024U,
+ PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(11), point_quant_info.at(11));
+ conv_13 << get_node_A_qasymm(conv_13, data_path, "conv13", 1024U,
+ PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ PadStrideInfo(1U, 1U, 0U, 0U), depth_quant_info.at(12), point_quant_info.at(12));
SubStream conv_14(conv_13);
- conv_14 << get_node_B_qasymm(conv_13, data_path, "conv13_2", 512U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(1),
- conv_quant_info.at(2));
+ conv_14 << get_node_B_qasymm(conv_13, data_path, "conv13_2", 512U, PadStrideInfo(1U, 1U, 0U, 0U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL),
+ conv_quant_info.at(1), conv_quant_info.at(2));
SubStream conv_15(conv_14);
- conv_15 << get_node_B_qasymm(conv_14, data_path, "conv13_3", 256U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(3),
- conv_quant_info.at(4));
+ conv_15 << get_node_B_qasymm(conv_14, data_path, "conv13_3", 256U, PadStrideInfo(1U, 1U, 0U, 0U),
+ PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ conv_quant_info.at(3), conv_quant_info.at(4));
SubStream conv_16(conv_15);
- conv_16 << get_node_B_qasymm(conv_15, data_path, "conv13_4", 256U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(5),
- conv_quant_info.at(6));
+ conv_16 << get_node_B_qasymm(conv_15, data_path, "conv13_4", 256U, PadStrideInfo(1U, 1U, 0U, 0U),
+ PadStrideInfo(2U, 2U, 1U, 1U, 1U, 1U, DimensionRoundingType::CEIL),
+ conv_quant_info.at(5), conv_quant_info.at(6));
SubStream conv_17(conv_16);
- conv_17 << get_node_B_qasymm(conv_16, data_path, "conv13_5", 128U, PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL), conv_quant_info.at(7),
- conv_quant_info.at(8));
+ conv_17 << get_node_B_qasymm(conv_16, data_path, "conv13_5", 128U, PadStrideInfo(1U, 1U, 0U, 0U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::CEIL),
+ conv_quant_info.at(7), conv_quant_info.at(8));
// box_predictor
- const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> box_enc_pred_quant_info =
- {
- { QuantizationInfo(0.005202020984143019f, 136), QuantizationInfo(0.08655580133199692f, 183) }, // boxpredictor0_bep
- { QuantizationInfo(0.003121797926723957f, 132), QuantizationInfo(0.03218776360154152f, 140) }, // boxpredictor1_bep
- { QuantizationInfo(0.002995674265548587f, 130), QuantizationInfo(0.029072262346744537f, 125) }, // boxpredictor2_bep
- { QuantizationInfo(0.0023131705820560455f, 130), QuantizationInfo(0.026488754898309708f, 127) }, // boxpredictor3_bep
- { QuantizationInfo(0.0013905081432312727f, 132), QuantizationInfo(0.0199890099465847f, 137) }, // boxpredictor4_bep
- { QuantizationInfo(0.00216794665902853f, 121), QuantizationInfo(0.019798893481492996f, 151) } // boxpredictor5_bep
+ const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> box_enc_pred_quant_info = {
+ {QuantizationInfo(0.005202020984143019f, 136),
+ QuantizationInfo(0.08655580133199692f, 183)}, // boxpredictor0_bep
+ {QuantizationInfo(0.003121797926723957f, 132),
+ QuantizationInfo(0.03218776360154152f, 140)}, // boxpredictor1_bep
+ {QuantizationInfo(0.002995674265548587f, 130),
+ QuantizationInfo(0.029072262346744537f, 125)}, // boxpredictor2_bep
+ {QuantizationInfo(0.0023131705820560455f, 130),
+ QuantizationInfo(0.026488754898309708f, 127)}, // boxpredictor3_bep
+ {QuantizationInfo(0.0013905081432312727f, 132),
+ QuantizationInfo(0.0199890099465847f, 137)}, // boxpredictor4_bep
+ {QuantizationInfo(0.00216794665902853f, 121),
+ QuantizationInfo(0.019798893481492996f, 151)} // boxpredictor5_bep
};
const std::vector<TensorShape> box_reshape = // NHWC
- {
- TensorShape(4U, 1U, 1083U), // boxpredictor0_bep_reshape
- TensorShape(4U, 1U, 600U), // boxpredictor1_bep_reshape
- TensorShape(4U, 1U, 150U), // boxpredictor2_bep_reshape
- TensorShape(4U, 1U, 54U), // boxpredictor3_bep_reshape
- TensorShape(4U, 1U, 24U), // boxpredictor4_bep_reshape
- TensorShape(4U, 1U, 6U) // boxpredictor5_bep_reshape
- };
+ {
+ TensorShape(4U, 1U, 1083U), // boxpredictor0_bep_reshape
+ TensorShape(4U, 1U, 600U), // boxpredictor1_bep_reshape
+ TensorShape(4U, 1U, 150U), // boxpredictor2_bep_reshape
+ TensorShape(4U, 1U, 54U), // boxpredictor3_bep_reshape
+ TensorShape(4U, 1U, 24U), // boxpredictor4_bep_reshape
+ TensorShape(4U, 1U, 6U) // boxpredictor5_bep_reshape
+ };
SubStream conv_11_box_enc_pre(graph);
- conv_11_box_enc_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_BEP", 12U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(0), box_reshape.at(0));
+ conv_11_box_enc_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_BEP", 12U,
+ PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(0),
+ box_reshape.at(0));
SubStream conv_13_box_enc_pre(conv_13);
- conv_13_box_enc_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(1), box_reshape.at(1));
+ conv_13_box_enc_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_BEP", 24U,
+ PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(1),
+ box_reshape.at(1));
SubStream conv_14_2_box_enc_pre(conv_14);
- conv_14_2_box_enc_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(2), box_reshape.at(2));
+ conv_14_2_box_enc_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_BEP", 24U,
+ PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(2),
+ box_reshape.at(2));
SubStream conv_15_2_box_enc_pre(conv_15);
- conv_15_2_box_enc_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(3), box_reshape.at(3));
+ conv_15_2_box_enc_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_BEP", 24U,
+ PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(3),
+ box_reshape.at(3));
SubStream conv_16_2_box_enc_pre(conv_16);
- conv_16_2_box_enc_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(4), box_reshape.at(4));
+ conv_16_2_box_enc_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_BEP", 24U,
+ PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(4),
+ box_reshape.at(4));
SubStream conv_17_2_box_enc_pre(conv_17);
- conv_17_2_box_enc_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_BEP", 24U, PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(5), box_reshape.at(5));
+ conv_17_2_box_enc_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_BEP", 24U,
+ PadStrideInfo(1U, 1U, 0U, 0U), box_enc_pred_quant_info.at(5),
+ box_reshape.at(5));
SubStream box_enc_pre(graph);
const QuantizationInfo bep_concate_qinfo = QuantizationInfo(0.08655580133199692f, 183);
- box_enc_pre << ConcatLayer(arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::HEIGHT, bep_concate_qinfo),
- std::move(conv_11_box_enc_pre), std::move(conv_13_box_enc_pre), conv_14_2_box_enc_pre, std::move(conv_15_2_box_enc_pre),
+ box_enc_pre << ConcatLayer(arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::HEIGHT,
+ bep_concate_qinfo),
+ std::move(conv_11_box_enc_pre), std::move(conv_13_box_enc_pre),
+ conv_14_2_box_enc_pre, std::move(conv_15_2_box_enc_pre),
std::move(conv_16_2_box_enc_pre), std::move(conv_17_2_box_enc_pre))
- .set_name("BoxPredictor/concat");
+ .set_name("BoxPredictor/concat");
box_enc_pre << ReshapeLayer(TensorShape(4U, 1917U)).set_name("BoxPredictor/reshape");
// class_predictor
- const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> class_pred_quant_info =
- {
- { QuantizationInfo(0.002744135679677129f, 125), QuantizationInfo(0.05746262148022652f, 234) }, // boxpredictor0_cp
- { QuantizationInfo(0.0024326108396053314f, 80), QuantizationInfo(0.03764628246426582f, 217) }, // boxpredictor1_cp
- { QuantizationInfo(0.0013898586621508002f, 141), QuantizationInfo(0.034081317484378815f, 214) }, // boxpredictor2_cp
- { QuantizationInfo(0.0014176908880472183f, 133), QuantizationInfo(0.033889178186655045f, 215) }, // boxpredictor3_cp
- { QuantizationInfo(0.001090311910957098f, 125), QuantizationInfo(0.02646234817802906f, 230) }, // boxpredictor4_cp
- { QuantizationInfo(0.001134163816459477f, 115), QuantizationInfo(0.026926767081022263f, 218) } // boxpredictor5_cp
+ const std::vector<std::pair<QuantizationInfo, QuantizationInfo>> class_pred_quant_info = {
+ {QuantizationInfo(0.002744135679677129f, 125),
+ QuantizationInfo(0.05746262148022652f, 234)}, // boxpredictor0_cp
+ {QuantizationInfo(0.0024326108396053314f, 80),
+ QuantizationInfo(0.03764628246426582f, 217)}, // boxpredictor1_cp
+ {QuantizationInfo(0.0013898586621508002f, 141),
+ QuantizationInfo(0.034081317484378815f, 214)}, // boxpredictor2_cp
+ {QuantizationInfo(0.0014176908880472183f, 133),
+ QuantizationInfo(0.033889178186655045f, 215)}, // boxpredictor3_cp
+ {QuantizationInfo(0.001090311910957098f, 125),
+ QuantizationInfo(0.02646234817802906f, 230)}, // boxpredictor4_cp
+ {QuantizationInfo(0.001134163816459477f, 115),
+ QuantizationInfo(0.026926767081022263f, 218)} // boxpredictor5_cp
};
- const std::vector<TensorShape> class_reshape =
- {
+ const std::vector<TensorShape> class_reshape = {
TensorShape(91U, 1083U), // boxpredictor0_cp_reshape
TensorShape(91U, 600U), // boxpredictor1_cp_reshape
TensorShape(91U, 150U), // boxpredictor2_cp_reshape
@@ -637,60 +719,80 @@ private:
};
SubStream conv_11_class_pre(graph);
- conv_11_class_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_CP", 273U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(0), class_reshape.at(0));
+ conv_11_class_pre << get_node_C_qasymm(graph, data_path, "BoxPredictor_0_CP", 273U,
+ PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(0),
+ class_reshape.at(0));
SubStream conv_13_class_pre(conv_13);
- conv_13_class_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(1), class_reshape.at(1));
+ conv_13_class_pre << get_node_C_qasymm(conv_13, data_path, "BoxPredictor_1_CP", 546U,
+ PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(1),
+ class_reshape.at(1));
SubStream conv_14_2_class_pre(conv_14);
- conv_14_2_class_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(2), class_reshape.at(2));
+ conv_14_2_class_pre << get_node_C_qasymm(conv_14, data_path, "BoxPredictor_2_CP", 546U,
+ PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(2),
+ class_reshape.at(2));
SubStream conv_15_2_class_pre(conv_15);
- conv_15_2_class_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(3), class_reshape.at(3));
+ conv_15_2_class_pre << get_node_C_qasymm(conv_15, data_path, "BoxPredictor_3_CP", 546U,
+ PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(3),
+ class_reshape.at(3));
SubStream conv_16_2_class_pre(conv_16);
- conv_16_2_class_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(4), class_reshape.at(4));
+ conv_16_2_class_pre << get_node_C_qasymm(conv_16, data_path, "BoxPredictor_4_CP", 546U,
+ PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(4),
+ class_reshape.at(4));
SubStream conv_17_2_class_pre(conv_17);
- conv_17_2_class_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_CP", 546U, PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(5), class_reshape.at(5));
+ conv_17_2_class_pre << get_node_C_qasymm(conv_17, data_path, "BoxPredictor_5_CP", 546U,
+ PadStrideInfo(1U, 1U, 0U, 0U), class_pred_quant_info.at(5),
+ class_reshape.at(5));
const QuantizationInfo cp_concate_qinfo = QuantizationInfo(0.0584389753639698f, 230);
SubStream class_pred(graph);
- class_pred << ConcatLayer(
- arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH, cp_concate_qinfo),
- std::move(conv_11_class_pre), std::move(conv_13_class_pre), std::move(conv_14_2_class_pre),
- std::move(conv_15_2_class_pre), std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre))
- .set_name("ClassPrediction/concat");
+ class_pred << ConcatLayer(arm_compute::graph::descriptors::ConcatLayerDescriptor(DataLayoutDimension::WIDTH,
+ cp_concate_qinfo),
+ std::move(conv_11_class_pre), std::move(conv_13_class_pre),
+ std::move(conv_14_2_class_pre), std::move(conv_15_2_class_pre),
+ std::move(conv_16_2_class_pre), std::move(conv_17_2_class_pre))
+ .set_name("ClassPrediction/concat");
const QuantizationInfo logistic_out_qinfo = QuantizationInfo(0.00390625f, 0);
- class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC), logistic_out_qinfo).set_name("ClassPrediction/logistic");
-
- const int max_detections = 10;
- const int max_classes_per_detection = 1;
- const float nms_score_threshold = 0.30000001192092896f;
- const float nms_iou_threshold = 0.6000000238418579f;
- const int num_classes = 90;
- const float x_scale = 10.f;
- const float y_scale = 10.f;
- const float h_scale = 5.f;
- const float w_scale = 5.f;
- std::array<float, 4> scales = { y_scale, x_scale, w_scale, h_scale };
- const QuantizationInfo anchors_qinfo = QuantizationInfo(0.006453060545027256f, 0);
+ class_pred << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC),
+ logistic_out_qinfo)
+ .set_name("ClassPrediction/logistic");
+
+ const int max_detections = 10;
+ const int max_classes_per_detection = 1;
+ const float nms_score_threshold = 0.30000001192092896f;
+ const float nms_iou_threshold = 0.6000000238418579f;
+ const int num_classes = 90;
+ const float x_scale = 10.f;
+ const float y_scale = 10.f;
+ const float h_scale = 5.f;
+ const float w_scale = 5.f;
+ std::array<float, 4> scales = {y_scale, x_scale, w_scale, h_scale};
+ const QuantizationInfo anchors_qinfo = QuantizationInfo(0.006453060545027256f, 0);
SubStream detection_ouput(box_enc_pre);
detection_ouput << DetectionPostProcessLayer(std::move(class_pred),
- DetectionPostProcessLayerInfo(max_detections, max_classes_per_detection, nms_score_threshold, nms_iou_threshold, num_classes, scales),
+ DetectionPostProcessLayerInfo(
+ max_detections, max_classes_per_detection, nms_score_threshold,
+ nms_iou_threshold, num_classes, scales),
get_weights_accessor(data_path, "anchors.npy"), anchors_qinfo)
- .set_name("DetectionPostProcess");
+ .set_name("DetectionPostProcess");
SubStream ouput_0(detection_ouput);
- ouput_0 << OutputLayer(get_npy_output_accessor(detection_boxes_opt->value(), TensorShape(4U, 10U), DataType::F32), 0);
+ ouput_0 << OutputLayer(
+ get_npy_output_accessor(detection_boxes_opt->value(), TensorShape(4U, 10U), DataType::F32), 0);
SubStream ouput_1(detection_ouput);
- ouput_1 << OutputLayer(get_npy_output_accessor(detection_classes_opt->value(), TensorShape(10U), DataType::F32), 1);
+ ouput_1 << OutputLayer(get_npy_output_accessor(detection_classes_opt->value(), TensorShape(10U), DataType::F32),
+ 1);
SubStream ouput_2(detection_ouput);
- ouput_2 << OutputLayer(get_npy_output_accessor(detection_scores_opt->value(), TensorShape(10U), DataType::F32), 2);
+ ouput_2 << OutputLayer(get_npy_output_accessor(detection_scores_opt->value(), TensorShape(10U), DataType::F32),
+ 2);
SubStream ouput_3(detection_ouput);
ouput_3 << OutputLayer(get_npy_output_accessor(num_detections_opt->value(), TensorShape(1U), DataType::F32), 3);
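For readers skimming the reformatted tables in this hunk: each QuantizationInfo(scale, offset) pair above encodes the usual asymmetric 8-bit affine mapping, real value ~= scale * (q - offset). Below is a minimal standalone sketch of that mapping; the quantize/dequantize helpers are illustrative only and are not part of the library.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustration only: the affine QASYMM8 mapping encoded by the (scale, offset)
// pairs in the tables above, real ~= scale * (q - offset).
static uint8_t quantize(float real, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(real / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

static float dequantize(uint8_t q, float scale, int offset)
{
    return scale * (static_cast<int>(q) - offset);
}

int main()
{
    // conv0 output quantization from the table above: scale 0.2221..., offset 113.
    const float scale  = 0.22219789028167725f;
    const int   offset = 113;

    const uint8_t q = quantize(1.5f, scale, offset);
    std::printf("q = %d, back to real = %f\n", static_cast<int>(q), dequantize(q, scale, offset));
    return 0;
}

This is only meant to make the magnitude of the scales and offsets in the tables easier to interpret; the actual quantization is handled inside the library kernels.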
diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp
index fcfe6ef50d..72ac9694b1 100644
--- a/examples/graph_vgg16.cpp
+++ b/examples/graph_vgg16.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphVGG16Example : public Example
{
public:
- GraphVGG16Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG16")
+ GraphVGG16Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG16")
{
}
bool do_setup(int argc, char **argv) override
@@ -49,7 +49,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -62,153 +62,143 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 123.68f, 116.779f, 103.939f } };
+ const std::array<float, 3> mean_rgb{{123.68f, 116.779f, 103.939f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
// Create graph
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
- // Layer 1
- << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv1_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_1/Relu")
- // Layer 2
- << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv1_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_2/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1")
- // Layer 3
- << ConvolutionLayer(
- 3U, 3U, 128U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_1/Relu")
- // Layer 4
- << ConvolutionLayer(
- 3U, 3U, 128U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_2/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2")
- // Layer 5
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_1/Relu")
- // Layer 6
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_2/Relu")
- // Layer 7
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_3/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool3")
- // Layer 8
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_1/Relu")
- // Layer 9
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_2/Relu")
- // Layer 10
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_3/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool4")
- // Layer 11
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_1/Relu")
- // Layer 12
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_2/Relu")
- // Layer 13
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_3/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool5")
- // Layer 14
- << FullyConnectedLayer(
- 4096U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_b.npy"))
- .set_name("fc6")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu")
- // Layer 15
- << FullyConnectedLayer(
- 4096U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_b.npy"))
- .set_name("fc7")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1")
- // Layer 16
- << FullyConnectedLayer(
- 1000U,
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_b.npy"))
- .set_name("fc8")
- // Softmax
- << SoftmaxLayer().set_name("prob")
- << OutputLayer(get_output_accessor(common_params, 5));
+ graph
+ << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
+ // Layer 1
+ << ConvolutionLayer(
+ 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv1_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1_1/Relu")
+ // Layer 2
+ << ConvolutionLayer(
+ 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv1_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv1_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1_2/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool1")
+ // Layer 3
+ << ConvolutionLayer(
+ 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2_1/Relu")
+ // Layer 4
+ << ConvolutionLayer(
+ 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv2_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2_2/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool2")
+ // Layer 5
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_1/Relu")
+ // Layer 6
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_2/Relu")
+ // Layer 7
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv3_3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_3/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool3")
+ // Layer 8
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_1/Relu")
+ // Layer 9
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_2/Relu")
+ // Layer 10
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv4_3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_3/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool4")
+ // Layer 11
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_1/Relu")
+ // Layer 12
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_2/Relu")
+ // Layer 13
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/conv5_3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_3/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool5")
+ // Layer 14
+ << FullyConnectedLayer(4096U,
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc6_b.npy"))
+ .set_name("fc6")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu")
+ // Layer 15
+ << FullyConnectedLayer(4096U,
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc7_b.npy"))
+ .set_name("fc7")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1")
+ // Layer 16
+ << FullyConnectedLayer(1000U,
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg16_model/fc8_b.npy"))
+ .set_name("fc8")
+ // Softmax
+ << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
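The mean_rgb array wrapped in this hunk feeds a Caffe-style preprocessor that subtracts a fixed per-channel mean from the input before inference. Below is a minimal standalone sketch of that idea; it illustrates per-channel mean subtraction only and is not the library's CaffePreproccessor, whose channel ordering and buffer layout are not shown in this diff.

#include <array>
#include <cstddef>
#include <vector>

// Illustration only: subtract a per-channel mean from an interleaved RGB buffer,
// the preprocessing idea behind the mean_rgb array in the hunk above.
static void subtract_mean_rgb(std::vector<float> &image_rgb, const std::array<float, 3> &mean_rgb)
{
    for (std::size_t i = 0; i < image_rgb.size(); ++i)
    {
        image_rgb[i] -= mean_rgb[i % 3]; // channel index for interleaved RGB data
    }
}

int main()
{
    const std::array<float, 3> mean_rgb{{123.68f, 116.779f, 103.939f}};
    std::vector<float>         image(224 * 224 * 3, 128.f); // dummy mid-grey image
    subtract_mean_rgb(image, mean_rgb);
    return 0;
}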
diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp
index efc0bcce19..9293544655 100644
--- a/examples/graph_vgg19.cpp
+++ b/examples/graph_vgg19.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -34,8 +35,7 @@ using namespace arm_compute::graph_utils;
class GraphVGG19Example : public Example
{
public:
- GraphVGG19Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG19")
+ GraphVGG19Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VGG19")
{
}
bool do_setup(int argc, char **argv) override
@@ -48,7 +48,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -61,165 +61,152 @@ public:
std::string data_path = common_params.data_path;
// Create a preprocessor object
- const std::array<float, 3> mean_rgb{ { 123.68f, 116.779f, 103.939f } };
+ const std::array<float, 3> mean_rgb{{123.68f, 116.779f, 103.939f}};
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<CaffePreproccessor>(mean_rgb);
// Create input descriptor
const auto operation_layout = common_params.data_layout;
- const TensorShape tensor_shape = permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(224U, 224U, 3U, common_params.batches), DataLayout::NCHW, operation_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(operation_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
- << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
- // Layer 1
- << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv1_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_1/Relu")
- << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv1_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv1_2/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool1")
- // Layer 2
- << ConvolutionLayer(
- 3U, 3U, 128U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_1/Relu")
- << ConvolutionLayer(
- 3U, 3U, 128U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv2_2/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool2")
- // Layer 3
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_1/Relu")
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_2/Relu")
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_3/Relu")
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv3_4")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv3_4/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool3")
- // Layer 4
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_1/Relu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_2/Relu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_3/Relu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv4_4")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv4_4/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool4")
- // Layer 5
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_1")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_1/Relu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_2")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_2/Relu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_3")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_3/Relu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_b.npy"),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv5_4")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv5_4/Relu")
- << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0))).set_name("pool5")
- // Layer 6
- << FullyConnectedLayer(
- 4096U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_b.npy"))
- .set_name("fc6")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu")
- // Layer 7
- << FullyConnectedLayer(
- 4096U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_b.npy"))
- .set_name("fc7")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1")
- // Layer 8
- << FullyConnectedLayer(
- 1000U,
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_b.npy"))
- .set_name("fc8")
- // Softmax
- << SoftmaxLayer().set_name("prob")
- << OutputLayer(get_output_accessor(common_params, 5));
+ graph
+ << common_params.target << common_params.fast_math_hint
+ << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor)))
+ // Layer 1
+ << ConvolutionLayer(
+ 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv1_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1_1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv1_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv1_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv1_2/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool1")
+ // Layer 2
+ << ConvolutionLayer(
+ 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2_1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 128U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv2_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv2_2/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool2")
+ // Layer 3
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_2/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv3_4_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv3_4")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv3_4/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool3")
+ // Layer 4
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_2/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv4_4_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv4_4")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv4_4/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool4")
+ // Layer 5
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_1_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_1")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_1/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_2_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_2")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_2/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_3_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_3")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_3/Relu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U, get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/conv5_4_b.npy"), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv5_4")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv5_4/Relu")
+ << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 2, operation_layout, PadStrideInfo(2, 2, 0, 0)))
+ .set_name("pool5")
+ // Layer 6
+ << FullyConnectedLayer(4096U,
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc6_b.npy"))
+ .set_name("fc6")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu")
+ // Layer 7
+ << FullyConnectedLayer(4096U,
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc7_b.npy"))
+ .set_name("fc7")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("Relu_1")
+ // Layer 8
+ << FullyConnectedLayer(1000U,
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/vgg19_model/fc8_b.npy"))
+ .set_name("fc8")
+ // Softmax
+ << SoftmaxLayer().set_name("prob") << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
diff --git a/examples/graph_vgg_vdsr.cpp b/examples/graph_vgg_vdsr.cpp
index 3fe28e0fed..a6cd337f82 100644
--- a/examples/graph_vgg_vdsr.cpp
+++ b/examples/graph_vgg_vdsr.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -36,8 +37,7 @@ using namespace arm_compute::graph_utils;
class GraphVDSRExample : public Example
{
public:
- GraphVDSRExample()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VDSR")
+ GraphVDSRExample() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "VDSR")
{
model_input_width = cmd_parser.add_option<SimpleOption<unsigned int>>("image-width", 192);
model_input_height = cmd_parser.add_option<SimpleOption<unsigned int>>("image-height", 192);
@@ -46,7 +46,7 @@ public:
model_input_width->set_help("Input image width.");
model_input_height->set_help("Input image height.");
}
- GraphVDSRExample(const GraphVDSRExample &) = delete;
+ GraphVDSRExample(const GraphVDSRExample &) = delete;
GraphVDSRExample &operator=(const GraphVDSRExample &) = delete;
~GraphVDSRExample() override = default;
bool do_setup(int argc, char **argv) override
@@ -59,7 +59,7 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
@@ -82,15 +82,17 @@ public:
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<TFPreproccessor>();
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(image_width, image_height, 1U, common_params.batches), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(image_width, image_height, 1U, common_params.batches), DataLayout::NCHW,
+ common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
// Note: Quantization info are random and used only for benchmarking purposes
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor.set_quantization_info(QuantizationInfo(0.0078125f, 128)),
get_input_accessor(common_params, std::move(preprocessor), false));
@@ -98,37 +100,34 @@ public:
SubStream right(graph);
// Layer 1
- right << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "conv0_w.npy", weights_layout),
- get_weights_accessor(data_path, "conv0_b.npy"),
- PadStrideInfo(1, 1, 1, 1), 1, QuantizationInfo(0.031778190285f, 156), QuantizationInfo(0.0784313753247f, 128))
- .set_name("conv0")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv0/Relu");
+ right << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, "conv0_w.npy", weights_layout),
+ get_weights_accessor(data_path, "conv0_b.npy"), PadStrideInfo(1, 1, 1, 1), 1,
+ QuantizationInfo(0.031778190285f, 156), QuantizationInfo(0.0784313753247f, 128))
+ .set_name("conv0")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv0/Relu");
// Rest 17 layers
- for(unsigned int i = 1; i < 19; ++i)
+ for (unsigned int i = 1; i < 19; ++i)
{
const std::string conv_w_path = "conv" + arm_compute::support::cpp11::to_string(i) + "_w.npy";
const std::string conv_b_path = "conv" + arm_compute::support::cpp11::to_string(i) + "_b.npy";
const std::string conv_name = "conv" + arm_compute::support::cpp11::to_string(i);
- right << ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, conv_w_path, weights_layout),
- get_weights_accessor(data_path, conv_b_path),
- PadStrideInfo(1, 1, 1, 1), 1, QuantizationInfo(0.015851572156f, 93))
- .set_name(conv_name)
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name(conv_name + "/Relu");
+ right << ConvolutionLayer(3U, 3U, 64U, get_weights_accessor(data_path, conv_w_path, weights_layout),
+ get_weights_accessor(data_path, conv_b_path), PadStrideInfo(1, 1, 1, 1), 1,
+ QuantizationInfo(0.015851572156f, 93))
+ .set_name(conv_name)
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name(conv_name + "/Relu");
}
// Final layer
- right << ConvolutionLayer(
- 3U, 3U, 1U,
- get_weights_accessor(data_path, "conv20_w.npy", weights_layout),
- get_weights_accessor(data_path, "conv20_b.npy"),
- PadStrideInfo(1, 1, 1, 1), 1, QuantizationInfo(0.015851572156f, 93))
- .set_name("conv20")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)).set_name("conv20/Relu");
+ right << ConvolutionLayer(3U, 3U, 1U, get_weights_accessor(data_path, "conv20_w.npy", weights_layout),
+ get_weights_accessor(data_path, "conv20_b.npy"), PadStrideInfo(1, 1, 1, 1), 1,
+ QuantizationInfo(0.015851572156f, 93))
+ .set_name("conv20")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+ .set_name("conv20/Relu");
// Add residual to input
graph << EltwiseLayer(std::move(left), std::move(right), EltwiseOperation::Add).set_name("add")
@@ -157,8 +156,8 @@ public:
private:
CommandLineParser cmd_parser;
CommonGraphOptions common_opts;
- SimpleOption<unsigned int> *model_input_width{ nullptr };
- SimpleOption<unsigned int> *model_input_height{ nullptr };
+ SimpleOption<unsigned int> *model_input_width{nullptr};
+ SimpleOption<unsigned int> *model_input_height{nullptr};
CommonGraphParams common_params;
Stream graph;
};
diff --git a/examples/graph_yolov3.cpp b/examples/graph_yolov3.cpp
index 3c8ddbffd8..5c8d3426ec 100644
--- a/examples/graph_yolov3.cpp
+++ b/examples/graph_yolov3.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph.h"
+
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
@@ -35,8 +36,7 @@ using namespace arm_compute::graph_utils;
class GraphYOLOv3Example : public Example
{
public:
- GraphYOLOv3Example()
- : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "YOLOv3")
+ GraphYOLOv3Example() : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "YOLOv3")
{
}
@@ -50,14 +50,15 @@ public:
common_params = consume_common_graph_parameters(common_opts);
// Return when help menu is requested
- if(common_params.help)
+ if (common_params.help)
{
cmd_parser.print_help(argv[0]);
return false;
}
// Checks
- ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type), "QASYMM8 not supported for this graph");
+ ARM_COMPUTE_EXIT_ON_MSG(arm_compute::is_data_type_quantized_asymmetric(common_params.data_type),
+ "QASYMM8 not supported for this graph");
// Print parameter values
std::cout << common_params << std::endl;
@@ -69,331 +70,322 @@ public:
std::unique_ptr<IPreprocessor> preprocessor = std::make_unique<TFPreproccessor>(0.f);
// Create input descriptor
- const TensorShape tensor_shape = permute_shape(TensorShape(608U, 608U, 3U, 1U), DataLayout::NCHW, common_params.data_layout);
- TensorDescriptor input_descriptor = TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
+ const TensorShape tensor_shape =
+ permute_shape(TensorShape(608U, 608U, 3U, 1U), DataLayout::NCHW, common_params.data_layout);
+ TensorDescriptor input_descriptor =
+ TensorDescriptor(tensor_shape, common_params.data_type).set_layout(common_params.data_layout);
// Set weights trained layout
const DataLayout weights_layout = DataLayout::NCHW;
- graph << common_params.target
- << common_params.fast_math_hint
+ graph << common_params.target << common_params.fast_math_hint
<< InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor), false));
std::pair<SubStream, SubStream> intermediate_layers = darknet53(data_path, weights_layout);
- graph << ConvolutionLayer(
- 1U, 1U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_53_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_53")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_beta.npy"),
- 0.000001f)
- .set_name("conv2d_53/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_53/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 1024U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_54_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_54")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_beta.npy"),
- 0.000001f)
- .set_name("conv2d_54/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_54/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_55_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_55")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_beta.npy"),
- 0.000001f)
- .set_name("conv2d_55/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_55/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 1024U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_56_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_56")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_beta.npy"),
- 0.000001f)
- .set_name("conv2d_56/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_56/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_57_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_57")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_beta.npy"),
- 0.000001f)
- .set_name("conv2d_57/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_57/LeakyRelu");
+ graph
+ << ConvolutionLayer(
+ 1U, 1U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_53_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_53")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_53_beta.npy"), 0.000001f)
+ .set_name("conv2d_53/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_53/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 1024U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_54_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_54")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_54_beta.npy"), 0.000001f)
+ .set_name("conv2d_54/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_54/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_55_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_55")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_55_beta.npy"), 0.000001f)
+ .set_name("conv2d_55/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_55/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 1024U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_56_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_56")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_56_beta.npy"), 0.000001f)
+ .set_name("conv2d_56/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_56/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_57_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_57")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_57_beta.npy"), 0.000001f)
+ .set_name("conv2d_57/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_57/LeakyRelu");
SubStream route_1(graph);
- graph << ConvolutionLayer(
- 3U, 3U, 1024U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_58_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_58")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_beta.npy"),
- 0.000001f)
- .set_name("conv2d_58/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_58/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 255U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_b.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_59")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)).set_name("conv2d_59/Linear")
- << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo1")
- << OutputLayer(get_output_accessor(common_params, 5));
+ graph
+ << ConvolutionLayer(
+ 3U, 3U, 1024U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_58_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_58")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_58_beta.npy"), 0.000001f)
+ .set_name("conv2d_58/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_58/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 255U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_59_b.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_59")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f))
+ .set_name("conv2d_59/Linear")
+ << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo1")
+ << OutputLayer(get_output_accessor(common_params, 5));
route_1 << ConvolutionLayer(
- 1U, 1U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_60_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_60")
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_60_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_60")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_beta.npy"),
- 0.000001f)
- .set_name("conv2d_59/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_60/LeakyRelu")
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_59_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_59/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_60/LeakyRelu")
<< ResizeLayer(InterpolationPolicy::NEAREST_NEIGHBOR, 2, 2).set_name("Upsample_60");
SubStream concat_1(route_1);
- concat_1 << ConcatLayer(std::move(route_1), std::move(intermediate_layers.second)).set_name("Route1")
- << ConvolutionLayer(
- 1U, 1U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_61_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_61")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_beta.npy"),
- 0.000001f)
- .set_name("conv2d_60/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_61/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_62_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_62")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_beta.npy"),
- 0.000001f)
- .set_name("conv2d_61/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_62/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_63_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_63")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_beta.npy"),
- 0.000001f)
- .set_name("conv2d_62/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_63/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_64_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_64")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_beta.npy"),
- 0.000001f)
- .set_name("conv2d_63/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_64/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_65_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_65")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_beta.npy"),
- 0.000001f)
- .set_name("conv2d_65/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_65/LeakyRelu");
+ concat_1
+ << ConcatLayer(std::move(route_1), std::move(intermediate_layers.second)).set_name("Route1")
+ << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_61_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_61")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_60_beta.npy"), 0.000001f)
+ .set_name("conv2d_60/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_61/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_62_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_62")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_61_beta.npy"), 0.000001f)
+ .set_name("conv2d_61/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_62/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_63_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_63")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_62_beta.npy"), 0.000001f)
+ .set_name("conv2d_62/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_63/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_64_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_64")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_63_beta.npy"), 0.000001f)
+ .set_name("conv2d_63/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_64/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_65_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_65")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_64_beta.npy"), 0.000001f)
+ .set_name("conv2d_65/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_65/LeakyRelu");
SubStream route_2(concat_1);
- concat_1 << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_66_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_66")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_beta.npy"),
- 0.000001f)
- .set_name("conv2d_65/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_66/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 255U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_b.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_67")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)).set_name("conv2d_67/Linear")
- << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo2")
- << OutputLayer(get_output_accessor(common_params, 5));
+ concat_1
+ << ConvolutionLayer(
+ 3U, 3U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_66_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_66")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_65_beta.npy"), 0.000001f)
+ .set_name("conv2d_65/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_66/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 255U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_67_b.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_67")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f))
+ .set_name("conv2d_67/Linear")
+ << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo2")
+ << OutputLayer(get_output_accessor(common_params, 5));
route_2 << ConvolutionLayer(
- 1U, 1U, 128U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_68_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_68")
+ 1U, 1U, 128U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_68_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_68")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_beta.npy"),
- 0.000001f)
- .set_name("conv2d_66/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_68/LeakyRelu")
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_66_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_66/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_68/LeakyRelu")
<< ResizeLayer(InterpolationPolicy::NEAREST_NEIGHBOR, 2, 2).set_name("Upsample_68");
SubStream concat_2(route_2);
- concat_2 << ConcatLayer(std::move(route_2), std::move(intermediate_layers.first)).set_name("Route2")
- << ConvolutionLayer(
- 1U, 1U, 128U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_69_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_69")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_beta.npy"),
- 0.000001f)
- .set_name("conv2d_67/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_69/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_70_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_70")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_beta.npy"),
- 0.000001f)
- .set_name("conv2d_68/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_70/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 128U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_71_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_71")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_beta.npy"),
- 0.000001f)
- .set_name("conv2d_69/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_71/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_72_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_72")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_beta.npy"),
- 0.000001f)
- .set_name("conv2d_70/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_72/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 128U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_73_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_73")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_beta.npy"),
- 0.000001f)
- .set_name("conv2d_71/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_73/LeakyRelu")
- << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_74_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_74")
- << BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_beta.npy"),
- 0.000001f)
- .set_name("conv2d_72/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_74/LeakyRelu")
- << ConvolutionLayer(
- 1U, 1U, 255U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_w.npy", weights_layout),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_b.npy", weights_layout),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_75")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f)).set_name("conv2d_75/Linear")
- << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo3")
- << OutputLayer(get_output_accessor(common_params, 5));
+ concat_2
+ << ConcatLayer(std::move(route_2), std::move(intermediate_layers.first)).set_name("Route2")
+ << ConvolutionLayer(
+ 1U, 1U, 128U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_69_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_69")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_67_beta.npy"), 0.000001f)
+ .set_name("conv2d_67/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_69/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_70_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_70")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_68_beta.npy"), 0.000001f)
+ .set_name("conv2d_68/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_70/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 128U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_71_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_71")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_69_beta.npy"), 0.000001f)
+ .set_name("conv2d_69/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_71/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_72_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_72")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_70_beta.npy"), 0.000001f)
+ .set_name("conv2d_70/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_72/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 128U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_73_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_73")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_71_beta.npy"), 0.000001f)
+ .set_name("conv2d_71/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_73/LeakyRelu")
+ << ConvolutionLayer(
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_74_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_74")
+ << BatchNormalizationLayer(
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_72_beta.npy"), 0.000001f)
+ .set_name("conv2d_72/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_74/LeakyRelu")
+ << ConvolutionLayer(
+ 1U, 1U, 255U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_w.npy", weights_layout),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_75_b.npy", weights_layout),
+ PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_75")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f))
+ .set_name("conv2d_75/Linear")
+ << YOLOLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.1f)).set_name("Yolo3")
+ << OutputLayer(get_output_accessor(common_params, 5));
// Finalize graph
GraphConfig config;
@@ -422,64 +414,64 @@ private:
std::pair<SubStream, SubStream> darknet53(const std::string &data_path, DataLayout weights_layout)
{
graph << ConvolutionLayer(
- 3U, 3U, 32U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_1_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_1/Conv2D")
+ 3U, 3U, 32U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_1_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_1/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_beta.npy"),
- 0.000001f)
- .set_name("conv2d_1/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_1/LeakyRelu")
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_1_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_1/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_1/LeakyRelu")
<< ConvolutionLayer(
- 3U, 3U, 64U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_2_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("conv2d_2/Conv2D")
+ 3U, 3U, 64U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_2_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 1, 1))
+ .set_name("conv2d_2/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_beta.npy"),
- 0.000001f)
- .set_name("conv2d_2/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_2/LeakyRelu");
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_2_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_2/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_2/LeakyRelu");
darknet53_block(data_path, "3", weights_layout, 32U);
graph << ConvolutionLayer(
- 3U, 3U, 128U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_5_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("conv2d_5/Conv2D")
+ 3U, 3U, 128U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_5_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 1, 1))
+ .set_name("conv2d_5/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_beta.npy"),
- 0.000001f)
- .set_name("conv2d_5/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_5/LeakyRelu");
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_5_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_5/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_5/LeakyRelu");
darknet53_block(data_path, "6", weights_layout, 64U);
darknet53_block(data_path, "8", weights_layout, 64U);
graph << ConvolutionLayer(
- 3U, 3U, 256U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_10_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("conv2d_10/Conv2D")
+ 3U, 3U, 256U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_10_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 1, 1))
+ .set_name("conv2d_10/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_beta.npy"),
- 0.000001f)
- .set_name("conv2d_10/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_10/LeakyRelu");
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_10_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_10/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_10/LeakyRelu");
darknet53_block(data_path, "11", weights_layout, 128U);
darknet53_block(data_path, "13", weights_layout, 128U);
darknet53_block(data_path, "15", weights_layout, 128U);
@@ -490,19 +482,19 @@ private:
darknet53_block(data_path, "25", weights_layout, 128U);
SubStream layer_36(graph);
graph << ConvolutionLayer(
- 3U, 3U, 512U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_27_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("conv2d_27/Conv2D")
+ 3U, 3U, 512U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_27_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 1, 1))
+ .set_name("conv2d_27/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_beta.npy"),
- 0.000001f)
- .set_name("conv2d_27/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_27/LeakyRelu");
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_27_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_27/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_27/LeakyRelu");
darknet53_block(data_path, "28", weights_layout, 256U);
darknet53_block(data_path, "30", weights_layout, 256U);
darknet53_block(data_path, "32", weights_layout, 256U);
@@ -513,19 +505,19 @@ private:
darknet53_block(data_path, "42", weights_layout, 256U);
SubStream layer_61(graph);
graph << ConvolutionLayer(
- 3U, 3U, 1024U,
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_44_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(2, 2, 1, 1))
- .set_name("conv2d_44/Conv2D")
+ 3U, 3U, 1024U,
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/conv2d_44_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(2, 2, 1, 1))
+ .set_name("conv2d_44/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_mean.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_var.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_gamma.npy"),
- get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_beta.npy"),
- 0.000001f)
- .set_name("conv2d_44/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_44/LeakyRelu");
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_mean.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_var.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_gamma.npy"),
+ get_weights_accessor(data_path, "/cnn_data/yolov3_model/batch_normalization_44_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_44/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_44/LeakyRelu");
darknet53_block(data_path, "45", weights_layout, 512U);
darknet53_block(data_path, "47", weights_layout, 512U);
darknet53_block(data_path, "49", weights_layout, 512U);
@@ -534,43 +526,48 @@ private:
return std::pair<SubStream, SubStream>(layer_36, layer_61);
}
- void darknet53_block(const std::string &data_path, std::string &&param_path, DataLayout weights_layout,
- unsigned int filter_size)
+ void darknet53_block(const std::string &data_path,
+ std::string &&param_path,
+ DataLayout weights_layout,
+ unsigned int filter_size)
{
- std::string total_path = "/cnn_data/yolov3_model/";
- std::string param_path2 = arm_compute::support::cpp11::to_string(arm_compute::support::cpp11::stoi(param_path) + 1);
- SubStream i_a(graph);
- SubStream i_b(graph);
+ std::string total_path = "/cnn_data/yolov3_model/";
+ std::string param_path2 =
+ arm_compute::support::cpp11::to_string(arm_compute::support::cpp11::stoi(param_path) + 1);
+ SubStream i_a(graph);
+ SubStream i_b(graph);
i_a << ConvolutionLayer(
- 1U, 1U, filter_size,
- get_weights_accessor(data_path, total_path + "conv2d_" + param_path + "_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 0, 0))
- .set_name("conv2d_" + param_path + "/Conv2D")
+ 1U, 1U, filter_size,
+ get_weights_accessor(data_path, total_path + "conv2d_" + param_path + "_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 0, 0))
+ .set_name("conv2d_" + param_path + "/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_mean.npy"),
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_var.npy"),
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_gamma.npy"),
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_beta.npy"),
- 0.000001f)
- .set_name("conv2d_" + param_path + "/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_" + param_path + "/LeakyRelu")
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_mean.npy"),
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_var.npy"),
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path + "_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_" + param_path + "/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_" + param_path + "/LeakyRelu")
<< ConvolutionLayer(
- 3U, 3U, filter_size * 2,
- get_weights_accessor(data_path, total_path + "conv2d_" + param_path2 + "_w.npy", weights_layout),
- std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
- PadStrideInfo(1, 1, 1, 1))
- .set_name("conv2d_" + param_path2 + "/Conv2D")
+ 3U, 3U, filter_size * 2,
+ get_weights_accessor(data_path, total_path + "conv2d_" + param_path2 + "_w.npy", weights_layout),
+ std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr), PadStrideInfo(1, 1, 1, 1))
+ .set_name("conv2d_" + param_path2 + "/Conv2D")
<< BatchNormalizationLayer(
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_mean.npy"),
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_var.npy"),
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_gamma.npy"),
- get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_beta.npy"),
- 0.000001f)
- .set_name("conv2d_" + param_path2 + "/BatchNorm")
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f)).set_name("conv2d_" + param_path2 + "/LeakyRelu");
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_mean.npy"),
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_var.npy"),
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_gamma.npy"),
+ get_weights_accessor(data_path, total_path + "batch_normalization_" + param_path2 + "_beta.npy"),
+ 0.000001f)
+ .set_name("conv2d_" + param_path2 + "/BatchNorm")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LEAKY_RELU, 0.1f))
+ .set_name("conv2d_" + param_path2 + "/LeakyRelu");
- graph << EltwiseLayer(std::move(i_a), std::move(i_b), EltwiseOperation::Add).set_name("").set_name("add_" + param_path + "_" + param_path2);
+ graph << EltwiseLayer(std::move(i_a), std::move(i_b), EltwiseOperation::Add)
+ .set_name("")
+ .set_name("add_" + param_path + "_" + param_path2);
}
};
diff --git a/examples/neon_cnn.cpp b/examples/neon_cnn.cpp
index 5ecf055e60..1f7a1ea6ca 100644
--- a/examples/neon_cnn.cpp
+++ b/examples/neon_cnn.cpp
@@ -21,13 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/NEFunctions.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/PoolManager.h"
+
#include "utils/Utils.h"
using namespace arm_compute;
@@ -43,12 +43,13 @@ public:
// Create memory manager components
// We need 2 memory managers: 1 for handling the tensors within the functions (mm_layers) and 1 for handling the input and output tensors of the functions (mm_transitions))
- auto lifetime_mgr0 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager
- auto lifetime_mgr1 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager
- auto pool_mgr0 = std::make_shared<PoolManager>(); // Create pool manager
- auto pool_mgr1 = std::make_shared<PoolManager>(); // Create pool manager
- auto mm_layers = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr0, pool_mgr0); // Create the memory manager
- auto mm_transitions = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr1, pool_mgr1); // Create the memory manager
+ auto lifetime_mgr0 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager
+ auto lifetime_mgr1 = std::make_shared<BlobLifetimeManager>(); // Create lifetime manager
+ auto pool_mgr0 = std::make_shared<PoolManager>(); // Create pool manager
+ auto pool_mgr1 = std::make_shared<PoolManager>(); // Create pool manager
+ auto mm_layers = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr0, pool_mgr0); // Create the memory manager
+ auto mm_transitions =
+ std::make_shared<MemoryManagerOnDemand>(lifetime_mgr1, pool_mgr1); // Create the memory manager
// The weights and biases tensors should be initialized with the values inferred with the training
@@ -116,7 +117,8 @@ public:
// Initialize tensor of fc0
constexpr unsigned int num_labels = 128;
- const TensorShape weights_shape_fc0(out_shape_pool1.x() * out_shape_pool1.y() * out_shape_pool1.z(), num_labels);
+ const TensorShape weights_shape_fc0(out_shape_pool1.x() * out_shape_pool1.y() * out_shape_pool1.z(),
+ num_labels);
const TensorShape biases_shape_fc0(num_labels);
const TensorShape out_shape_fc0(num_labels);
@@ -138,22 +140,28 @@ public:
/* [Configure functions] */
// in:32x32x1: 5x5 convolution, 8 output features maps (OFM)
- conv0->configure(&src, &weights0, &biases0, &out_conv0, PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 2 /* pad_x */, 2 /* pad_y */));
+ conv0->configure(&src, &weights0, &biases0, &out_conv0,
+ PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 2 /* pad_x */, 2 /* pad_y */));
// in:32x32x8, out:32x32x8, Activation function: relu
act0.configure(&out_conv0, &out_act0, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// in:32x32x8, out:16x16x8 (2x2 pooling), Pool type function: Max
- pool0.configure(&out_act0, &out_pool0, PoolingLayerInfo(PoolingType::MAX, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */)));
+ pool0.configure(
+ &out_act0, &out_pool0,
+ PoolingLayerInfo(PoolingType::MAX, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */)));
// in:16x16x8: 3x3 convolution, 16 output features maps (OFM)
- conv1->configure(&out_pool0, &weights1, &biases1, &out_conv1, PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 1 /* pad_y */));
+ conv1->configure(&out_pool0, &weights1, &biases1, &out_conv1,
+ PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_x */, 1 /* pad_y */));
// in:16x16x16, out:16x16x16, Activation function: relu
act1.configure(&out_conv1, &out_act1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
// in:16x16x16, out:8x8x16 (2x2 pooling), Pool type function: Average
- pool1.configure(&out_act1, &out_pool1, PoolingLayerInfo(PoolingType::AVG, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */)));
+ pool1.configure(
+ &out_act1, &out_pool1,
+ PoolingLayerInfo(PoolingType::AVG, 2, data_layout, PadStrideInfo(2 /* stride_x */, 2 /* stride_y */)));
// in:8x8x16, out:128
fc0->configure(&out_pool1, &weights2, &biases2, &out_fc0);
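For context on the memory-manager lines re-wrapped above: each manager is composed from a lifetime manager plus a pool manager. A minimal sketch of that wiring follows; handing the manager to a layer constructor and the populate(allocator, num_pools) step are assumptions about the usual usage rather than something shown in these hunks, and the layer choice is purely illustrative.

#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

int main()
{
    // One lifetime manager plus one pool manager back a single on-demand memory manager.
    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions constructed with the manager share their internal buffers through it
    // (layer choice is illustrative only).
    NEFullyConnectedLayer fc(mm);

    // ... configure fc and the tensors it uses here ...

    // Assumed step: back the manager with an allocator before the first run.
    Allocator allocator{};
    mm->populate(allocator, 1 /* num_pools */);
    return 0;
}
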
diff --git a/examples/neon_copy_objects.cpp b/examples/neon_copy_objects.cpp
index b060b09759..6e9ebcaad5 100644
--- a/examples/neon_copy_objects.cpp
+++ b/examples/neon_copy_objects.cpp
@@ -22,9 +22,9 @@
* SOFTWARE.
*/
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
-#include "arm_compute/core/Types.h"
#include "utils/Utils.h"
#include <cstring>
@@ -50,11 +50,11 @@ public:
dst_data = new float[width * height * batch];
// Fill src_data with pseudo(meaningless) values:
- for(unsigned int b = 0; b < batch; b++)
+ for (unsigned int b = 0; b < batch; b++)
{
- for(unsigned int h = 0; h < height; h++)
+ for (unsigned int h = 0; h < height; h++)
{
- for(unsigned int w = 0; w < width; w++)
+ for (unsigned int w = 0; w < width; w++)
{
src_data[b * (width * height) + h * width + w] = static_cast<float>(100 * b + 10 * h + w);
}
@@ -78,9 +78,12 @@ public:
Window input_window;
input_window.use_tensor_dimensions(input.info()->tensor_shape());
std::cout << " Dimensions of the input's iterator:\n";
- std::cout << " X = [start=" << input_window.x().start() << ", end=" << input_window.x().end() << ", step=" << input_window.x().step() << "]\n";
- std::cout << " Y = [start=" << input_window.y().start() << ", end=" << input_window.y().end() << ", step=" << input_window.y().step() << "]\n";
- std::cout << " Z = [start=" << input_window.z().start() << ", end=" << input_window.z().end() << ", step=" << input_window.z().step() << "]\n";
+ std::cout << " X = [start=" << input_window.x().start() << ", end=" << input_window.x().end()
+ << ", step=" << input_window.x().step() << "]\n";
+ std::cout << " Y = [start=" << input_window.y().start() << ", end=" << input_window.y().end()
+ << ", step=" << input_window.y().step() << "]\n";
+ std::cout << " Z = [start=" << input_window.z().start() << ", end=" << input_window.z().end()
+ << ", step=" << input_window.z().step() << "]\n";
// Create an iterator:
Iterator input_it(&input, input_window);
@@ -98,20 +101,28 @@ public:
// }
// }
// Except it works for an arbitrary number of dimensions
- execute_window_loop(input_window, [&](const Coordinates & id)
- {
- std::cout << "Setting item [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
- *reinterpret_cast<float *>(input_it.ptr()) = src_data[id.z() * (width * height) + id.y() * width + id.x()];
- },
- input_it);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &id)
+ {
+ std::cout << "Setting item [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
+ *reinterpret_cast<float *>(input_it.ptr()) =
+ src_data[id.z() * (width * height) + id.y() * width + id.x()];
+ },
+ input_it);
// More efficient way: create an iterator to iterate through each row (instead of each element) of the output tensor:
Window output_window;
- output_window.use_tensor_dimensions(output.info()->tensor_shape(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element)
+ output_window.use_tensor_dimensions(
+ output.info()->tensor_shape(),
+ /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element)
std::cout << " Dimensions of the output's iterator:\n";
- std::cout << " X = [start=" << output_window.x().start() << ", end=" << output_window.x().end() << ", step=" << output_window.x().step() << "]\n";
- std::cout << " Y = [start=" << output_window.y().start() << ", end=" << output_window.y().end() << ", step=" << output_window.y().step() << "]\n";
- std::cout << " Z = [start=" << output_window.z().start() << ", end=" << output_window.z().end() << ", step=" << output_window.z().step() << "]\n";
+ std::cout << " X = [start=" << output_window.x().start() << ", end=" << output_window.x().end()
+ << ", step=" << output_window.x().step() << "]\n";
+ std::cout << " Y = [start=" << output_window.y().start() << ", end=" << output_window.y().end()
+ << ", step=" << output_window.y().step() << "]\n";
+ std::cout << " Z = [start=" << output_window.z().start() << ", end=" << output_window.z().end()
+ << ", step=" << output_window.z().step() << "]\n";
// Create an iterator:
Iterator output_it(&output, output_window);
@@ -126,13 +137,15 @@ public:
// }
// }
// Except it works for an arbitrary number of dimensions
- execute_window_loop(output_window, [&](const Coordinates & id)
- {
- std::cout << "Copying one row starting from [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
- // Copy one whole row:
- memcpy(dst_data + id.z() * (width * height) + id.y() * width, output_it.ptr(), width * sizeof(float));
- },
- output_it);
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id)
+ {
+ std::cout << "Copying one row starting from [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
+ // Copy one whole row:
+ memcpy(dst_data + id.z() * (width * height) + id.y() * width, output_it.ptr(), width * sizeof(float));
+ },
+ output_it);
/** [Copy objects example] */
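Because the two execute_window_loop call sites above only appear as fragments, here is a self-contained sketch of the same iteration pattern in the new formatting; the exact header list is an assumption, since the example itself pulls most of this in through utils/Utils.h.

#include "arm_compute/core/Helpers.h" // Iterator, execute_window_loop
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/Tensor.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // A small 4x3 F32 tensor, initialised and allocated as in the example above.
    Tensor t;
    t.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
    t.allocator()->allocate();

    // Build a window that covers every element and bind an iterator to it.
    Window win;
    win.use_tensor_dimensions(t.info()->tensor_shape());
    Iterator it(&t, win);

    // The lambda runs once per coordinate; it.ptr() points at the current element.
    execute_window_loop(
        win,
        [&](const Coordinates &id)
        { *reinterpret_cast<float *>(it.ptr()) = static_cast<float>(10 * id.y() + id.x()); },
        it);

    std::cout << "Filled " << t.info()->tensor_shape().total_size() << " elements\n";
    return 0;
}
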
diff --git a/examples/neon_gemm_qasymm8.cpp b/examples/neon_gemm_qasymm8.cpp
index f6f5dc1026..3aaad02f8a 100644
--- a/examples/neon_gemm_qasymm8.cpp
+++ b/examples/neon_gemm_qasymm8.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/WindowIterator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/WindowIterator.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "support/ToolchainSupport.h"
#include "utils/Utils.h"
@@ -38,7 +39,7 @@ using namespace utils;
void find_min_max(int size, const float *data, float *min, float *max)
{
*min = *max = data[0];
- for(int i = 0; i < size; i++)
+ for (int i = 0; i < size; i++)
{
const float val = data[i];
*min = std::min(*min, val);
@@ -66,11 +67,11 @@ QuantizationInfo choose_quantization_params(float min, float max)
// But we need to nudge the zero_point to an integer (exact quantized value)
std::uint8_t zero_point_nudged = 0;
- if(zero_point_real < qmin)
+ if (zero_point_real < qmin)
{
zero_point_nudged = qmin;
}
- else if(zero_point_real > qmax)
+ else if (zero_point_real > qmax)
{
zero_point_nudged = qmax;
}
@@ -85,7 +86,7 @@ QuantizationInfo choose_quantization_params(float min, float max)
void quantize_values(int size, qasymm8_t *output, float *input, const QuantizationInfo qinfo)
{
- for(int i = 0; i < size; i++)
+ for (int i = 0; i < size; i++)
{
output[i] = quantize_qasymm8(input[i], qinfo);
}
@@ -108,7 +109,7 @@ int main(int argc, char **argv)
bool default_input = true;
// Parse args
- if(argc < 3) /* case default matrix sizes */
+ if (argc < 3) /* case default matrix sizes */
{
// Print help
std::cout << "Usage: ./build/neon_gemm_qasymm8 M N K\n";
@@ -144,23 +145,23 @@ int main(int argc, char **argv)
// Fill in: one is the identity matrix, other is sequential values
// src1: Identity matrix
- for(size_t i = 0; i < M * K; i++)
+ for (size_t i = 0; i < M * K; i++)
{
src1_ptr[i] = 0;
}
- for(size_t i = 0; i < M; i++)
+ for (size_t i = 0; i < M; i++)
{
src1_ptr[i * K + i] = 1.0f;
}
// src2: Sequential values matrix
- for(size_t i = 0; i < K * N; i++)
+ for (size_t i = 0; i < K * N; i++)
{
src2_ptr[i] = i * 1.123f;
}
// Otherwise if M, N, K is given, fill in with random values
- if(!default_input)
+ if (!default_input)
{
fill_random_tensor(src1, 0.f, 1.f);
fill_random_tensor(src2, 0.f, 1.f);
@@ -223,7 +224,7 @@ int main(int argc, char **argv)
NEGEMMLowpOutputStage gemmlowp_output_stage;
int output_multiplier;
int output_shift;
- float multiplier = (src1_qinfo.uniform().scale * src2_qinfo.uniform().scale) / dst0_qinfo.uniform().scale;
+ float multiplier = (src1_qinfo.uniform().scale * src2_qinfo.uniform().scale) / dst0_qinfo.uniform().scale;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
std::cout << "(q_multiplier, q_shift) = (" << output_multiplier << ", " << output_shift << ")\n\n";
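The hunks above show only the zero-point nudging half of choose_quantization_params plus the per-element quantize loop; the sketch below restates the full parameter choice under the usual affine-mapping assumption real = scale * (q - offset) over the uint8 range [0, 255] (the scale formula is assumed, not copied from the file).

#include "arm_compute/core/QuantizationInfo.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

using namespace arm_compute;

QuantizationInfo choose_params(float min, float max)
{
    const float qmin = 0.0f;
    const float qmax = 255.0f;

    // Assumed affine mapping: one float step per quantized step across [min, max].
    const float scale           = (max - min) / (qmax - qmin);
    const float zero_point_real = qmin - min / scale;

    // Nudge the zero point onto an exact integer grid point, clamped to [qmin, qmax],
    // as in the hunk above.
    const float   clamped    = std::min(qmax, std::max(qmin, zero_point_real));
    const uint8_t zero_point = static_cast<uint8_t>(std::round(clamped));

    return QuantizationInfo(scale, zero_point);
}

int main()
{
    const QuantizationInfo qinfo = choose_params(-1.0f, 5.0f);
    // quantize_qasymm8() then maps a float onto the 8-bit grid defined by qinfo.
    const qasymm8_t q = quantize_qasymm8(3.14f, qinfo);
    std::cout << "scale=" << qinfo.uniform().scale << " offset=" << qinfo.uniform().offset
              << " q(3.14)=" << static_cast<int>(q) << "\n";
    return 0;
}
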
diff --git a/examples/neon_permute.cpp b/examples/neon_permute.cpp
index 49848de4ea..76ba079430 100644
--- a/examples/neon_permute.cpp
+++ b/examples/neon_permute.cpp
@@ -21,9 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
-#include "arm_compute/core/Types.h"
#include "utils/Utils.h"
using namespace arm_compute;
@@ -85,11 +85,13 @@ private:
window.use_tensor_dimensions(reference.info()->tensor_shape());
Iterator ref_it(&reference, window);
Iterator res_it(&result, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- assert(*reinterpret_cast<unsigned char *>(ref_it.ptr()) == *reinterpret_cast<unsigned char *>(res_it.ptr()));
- },
- ref_it, res_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ assert(*reinterpret_cast<unsigned char *>(ref_it.ptr()) ==
+ *reinterpret_cast<unsigned char *>(res_it.ptr()));
+ },
+ ref_it, res_it);
}
void fill_tensor(Tensor &tensor)
@@ -98,11 +100,9 @@ private:
window.use_tensor_dimensions(tensor.info()->tensor_shape());
Iterator tensor_it(&tensor, window);
unsigned char val(0);
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<unsigned char *>(tensor_it.ptr()) = val++;
- },
- tensor_it);
+ execute_window_loop(
+ window, [&](const Coordinates &) { *reinterpret_cast<unsigned char *>(tensor_it.ptr()) = val++; },
+ tensor_it);
}
void init_tensor(const TensorShape shape, Tensor &tensor, DataType type, DataLayout layout)
{
diff --git a/examples/neon_scale.cpp b/examples/neon_scale.cpp
index f120ea7f96..28590bd861 100644
--- a/examples/neon_scale.cpp
+++ b/examples/neon_scale.cpp
@@ -21,9 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
-#include "arm_compute/core/Types.h"
#include "utils/ImageLoader.h"
#include "utils/Utils.h"
@@ -37,7 +37,7 @@ public:
{
PPMLoader ppm;
- if(argc < 2)
+ if (argc < 2)
{
// Print help
std::cout << "Usage: ./build/neon_scale[input_image.ppm]\n\n";
@@ -60,20 +60,16 @@ public:
dst.allocator()->init(dst_tensor_info);
// Configure Scale function object:
- scale.configure(&src, &dst, ScaleKernelInfo{
- InterpolationPolicy::NEAREST_NEIGHBOR,
- BorderMode::UNDEFINED,
- PixelValue(),
- SamplingPolicy::CENTER,
- false
- });
+ scale.configure(&src, &dst,
+ ScaleKernelInfo{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED, PixelValue(),
+ SamplingPolicy::CENTER, false});
// Allocate all the images
src.allocator()->allocate();
dst.allocator()->allocate();
// Fill the input image with the content of the PPM image if a filename was provided:
- if(ppm.is_open())
+ if (ppm.is_open())
{
ppm.fill_image(src);
output_filename = std::string(argv[1]) + "_out.ppm";
@@ -89,7 +85,7 @@ public:
void do_teardown() override
{
// Save the result to file:
- if(!output_filename.empty())
+ if (!output_filename.empty())
{
save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
}
diff --git a/examples/neon_sgemm.cpp b/examples/neon_sgemm.cpp
index 07696bd622..8cda65a400 100644
--- a/examples/neon_sgemm.cpp
+++ b/examples/neon_sgemm.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "utils/Utils.h"
#include <cstdlib>
@@ -43,15 +44,16 @@ public:
beta = 0.0f;
std::ifstream stream;
- if(argc > 1)
+ if (argc > 1)
{
stream.open(argv[1], std::fstream::in);
}
- if(argc < 3 || (argc < 4 && stream.bad()))
+ if (argc < 3 || (argc < 4 && stream.bad()))
{
// Print help
- std::cout << "Usage: 1) ./build/neon_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] [alpha = 1] [beta = 0]\n";
+ std::cout << "Usage: 1) ./build/neon_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] "
+ "[alpha = 1] [beta = 0]\n";
std::cout << " 2) ./build/neon_sgemm M N K [alpha = 1.0f] [beta = 0.0f]\n\n";
std::cout << "Too few or no input_matrices provided. Using M=7, N=3, K=5, alpha=1.0f and beta=0.0f\n\n";
@@ -61,29 +63,29 @@ public:
}
else
{
- if(stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */
+ if (stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1.0f] [beta = 0.0f] */
{
npy0.open(argv[1]);
npy0.init_tensor(src0, DataType::F32);
npy1.open(argv[2]);
npy1.init_tensor(src1, DataType::F32);
- if(argc > 3)
+ if (argc > 3)
{
stream.close();
stream.clear();
stream.open(argv[3], std::fstream::in);
- if(stream.good()) /* case with third file */
+ if (stream.good()) /* case with third file */
{
npy2.open(argv[3]);
npy2.init_tensor(src2, DataType::F32);
- if(argc > 4)
+ if (argc > 4)
{
// Convert string to float
alpha = strtof(argv[4], nullptr);
- if(argc > 5)
+ if (argc > 5)
{
// Convert string to float
beta = strtof(argv[5], nullptr);
@@ -94,7 +96,7 @@ public:
{
alpha = strtof(argv[3], nullptr);
- if(argc > 4)
+ if (argc > 4)
{
beta = strtof(argv[4], nullptr);
}
@@ -111,11 +113,11 @@ public:
src1.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
src2.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));
- if(argc > 4)
+ if (argc > 4)
{
alpha = strtof(argv[4], nullptr);
- if(argc > 5)
+ if (argc > 5)
{
beta = strtof(argv[5], nullptr);
}
@@ -134,7 +136,7 @@ public:
dst.allocator()->allocate();
// Fill the input images with either the data provided or random data
- if(npy0.is_open())
+ if (npy0.is_open())
{
npy0.fill_tensor(src0);
npy1.fill_tensor(src1);
@@ -142,7 +144,7 @@ public:
output_filename = "sgemm_out.npy";
is_fortran = npy0.is_fortran();
- if(npy2.is_open())
+ if (npy2.is_open())
{
src2.allocator()->allocate();
npy2.fill_tensor(src2);
@@ -169,7 +171,7 @@ public:
}
void do_teardown() override
{
- if(!output_filename.empty()) /* Save to .npy file */
+ if (!output_filename.empty()) /* Save to .npy file */
{
save_to_npy(dst, output_filename, is_fortran);
}
diff --git a/src/c/AclContext.cpp b/src/c/AclContext.cpp
index 9b8ffea619..c6c0820c92 100644
--- a/src/c/AclContext.cpp
+++ b/src/c/AclContext.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "arm_compute/AclEntrypoints.h"
-
#include "arm_compute/core/Error.h"
#include "src/common/IContext.h"
@@ -42,25 +41,25 @@ namespace
template <typename ContextType>
arm_compute::IContext *create_backend_ctx(const AclContextOptions *options)
{
- return new(std::nothrow) ContextType(options);
+ return new (std::nothrow) ContextType(options);
}
bool is_target_valid(AclTarget target)
{
- return arm_compute::utils::is_in(target, { AclCpu, AclGpuOcl });
+ return arm_compute::utils::is_in(target, {AclCpu, AclGpuOcl});
}
bool are_context_options_valid(const AclContextOptions *options)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(options);
- return arm_compute::utils::is_in(options->mode, { AclPreferFastRerun, AclPreferFastStart });
+ return arm_compute::utils::is_in(options->mode, {AclPreferFastRerun, AclPreferFastStart});
}
arm_compute::IContext *create_context(AclTarget target, const AclContextOptions *options)
{
ARM_COMPUTE_UNUSED(options);
- switch(target)
+ switch (target)
{
#ifdef ARM_COMPUTE_CPU_ENABLED
case AclCpu:
@@ -77,24 +76,22 @@ arm_compute::IContext *create_context(AclTarget target, const AclContextOptions
}
} // namespace
-extern "C" AclStatus AclCreateContext(AclContext *external_ctx,
- AclTarget target,
- const AclContextOptions *options)
+extern "C" AclStatus AclCreateContext(AclContext *external_ctx, AclTarget target, const AclContextOptions *options)
{
- if(!is_target_valid(target))
+ if (!is_target_valid(target))
{
ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Target is invalid!");
return AclUnsupportedTarget;
}
- if(options != nullptr && !are_context_options_valid(options))
+ if (options != nullptr && !are_context_options_valid(options))
{
ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context options are invalid!");
return AclInvalidArgument;
}
auto ctx = create_context(target, options);
- if(ctx == nullptr)
+ if (ctx == nullptr)
{
ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources for context creation!");
return AclOutOfMemory;
@@ -113,7 +110,7 @@ extern "C" AclStatus AclDestroyContext(AclContext external_ctx)
StatusCode status = detail::validate_internal_context(ctx);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
- if(ctx->refcount() != 0)
+ if (ctx->refcount() != 0)
{
ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context has references on it that haven't been released!");
// TODO: Fix the refcount with callback when reaches 0
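For reference, the entry points reformatted above are driven roughly as follows; this sketch relies only on the signatures and status codes visible in this diff, and the error handling is illustrative.

#include "arm_compute/AclEntrypoints.h"

#include <cstdio>

int main()
{
    // Options may be a null pointer: AclCreateContext only validates them when provided.
    AclContext ctx = nullptr;
    if (AclCreateContext(&ctx, AclCpu, nullptr) != AclSuccess)
    {
        std::printf("context creation failed\n");
        return 1;
    }

    // ... create queues, tensors and operators against ctx here ...

    // Destruction reports an error if live objects still hold references to the context.
    if (AclDestroyContext(ctx) != AclSuccess)
    {
        std::printf("context destruction failed\n");
        return 1;
    }
    return 0;
}
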
diff --git a/src/c/AclQueue.cpp b/src/c/AclQueue.cpp
index 020c6ed531..c3e867bffc 100644
--- a/src/c/AclQueue.cpp
+++ b/src/c/AclQueue.cpp
@@ -38,7 +38,7 @@ namespace
bool is_mode_valid(const AclQueueOptions *options)
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(options);
- return arm_compute::utils::is_in(options->mode, { AclTuningModeNone, AclRapid, AclNormal, AclExhaustive });
+ return arm_compute::utils::is_in(options->mode, {AclTuningModeNone, AclRapid, AclNormal, AclExhaustive});
}
} // namespace
@@ -51,14 +51,14 @@ extern "C" AclStatus AclCreateQueue(AclQueue *external_queue, AclContext externa
StatusCode status = detail::validate_internal_context(ctx);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
- if(options != nullptr && !is_mode_valid(options))
+ if (options != nullptr && !is_mode_valid(options))
{
ARM_COMPUTE_LOG_ERROR_ACL("Queue options are invalid");
return AclInvalidArgument;
}
auto queue = ctx->create_queue(options);
- if(queue == nullptr)
+ if (queue == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
return AclOutOfMemory;
diff --git a/src/c/AclTensor.cpp b/src/c/AclTensor.cpp
index 5b184697aa..c4cd08ac70 100644
--- a/src/c/AclTensor.cpp
+++ b/src/c/AclTensor.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/AclEntrypoints.h"
#include "arm_compute/AclUtils.h"
#include "arm_compute/core/Error.h"
+
#include "src/common/ITensorV2.h"
#include "src/common/utils/Macros.h"
@@ -41,17 +42,17 @@ constexpr int32_t max_allowed_dims = 6;
*/
bool is_desc_valid(const AclTensorDescriptor &desc)
{
- if(desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown)
+ if (desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown)
{
ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Unknown data type!");
return false;
}
- if(desc.ndims > max_allowed_dims)
+ if (desc.ndims > max_allowed_dims)
{
ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions surpass the maximum allowed value!");
return false;
}
- if(desc.ndims > 0 && desc.shape == nullptr)
+ if (desc.ndims > 0 && desc.shape == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions values are empty while dimensionality is > 0!");
return false;
@@ -66,10 +67,8 @@ StatusCode convert_and_validate_tensor(AclTensor tensor, ITensorV2 **internal_te
}
} // namespace
-extern "C" AclStatus AclCreateTensor(AclTensor *external_tensor,
- AclContext external_ctx,
- const AclTensorDescriptor *desc,
- bool allocate)
+extern "C" AclStatus
+AclCreateTensor(AclTensor *external_tensor, AclContext external_ctx, const AclTensorDescriptor *desc, bool allocate)
{
using namespace arm_compute;
@@ -78,14 +77,14 @@ extern "C" AclStatus AclCreateTensor(AclTensor *external_tensor,
StatusCode status = detail::validate_internal_context(ctx);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
- if(desc == nullptr || !is_desc_valid(*desc))
+ if (desc == nullptr || !is_desc_valid(*desc))
{
ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Descriptor is invalid!");
return AclInvalidArgument;
}
auto tensor = ctx->create_tensor(*desc, allocate);
- if(tensor == nullptr)
+ if (tensor == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Couldn't allocate internal resources for tensor creation!");
return AclOutOfMemory;
@@ -103,7 +102,7 @@ extern "C" AclStatus AclMapTensor(AclTensor external_tensor, void **handle)
StatusCode status = detail::validate_internal_tensor(tensor);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
- if(handle == nullptr)
+ if (handle == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[AclMapTensor]: Handle object is nullptr!");
return AclInvalidArgument;
@@ -160,12 +159,12 @@ extern "C" AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size)
{
using namespace arm_compute;
- if(size == nullptr)
+ if (size == nullptr)
{
return AclStatus::AclInvalidArgument;
}
- ITensorV2 *internal_tensor{ nullptr };
+ ITensorV2 *internal_tensor{nullptr};
auto status = convert_and_validate_tensor(tensor, &internal_tensor);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
@@ -177,15 +176,15 @@ extern "C" AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescripto
{
using namespace arm_compute;
- if(desc == nullptr)
+ if (desc == nullptr)
{
return AclStatus::AclInvalidArgument;
}
- ITensorV2 *internal_tensor{ nullptr };
+ ITensorV2 *internal_tensor{nullptr};
const auto status = convert_and_validate_tensor(tensor, &internal_tensor);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
*desc = internal_tensor->get_descriptor();
return utils::as_cenum<AclStatus>(status);
-}
\ No newline at end of file
+}
diff --git a/src/c/AclTensorPack.cpp b/src/c/AclTensorPack.cpp
index 6202524ca7..daf1be4f44 100644
--- a/src/c/AclTensorPack.cpp
+++ b/src/c/AclTensorPack.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/AclEntrypoints.h"
+
#include "src/common/ITensorV2.h"
#include "src/common/TensorPack.h"
#include "src/common/utils/Macros.h"
@@ -36,7 +37,7 @@ StatusCode PackTensorInternal(TensorPack &pack, AclTensor external_tensor, int32
status = detail::validate_internal_tensor(tensor);
- if(status != StatusCode::Success)
+ if (status != StatusCode::Success)
{
return status;
}
@@ -57,7 +58,7 @@ extern "C" AclStatus AclCreateTensorPack(AclTensorPack *external_pack, AclContex
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
auto pack = new TensorPack(ctx);
- if(pack == nullptr)
+ if (pack == nullptr)
{
ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources!");
return AclOutOfMemory;
@@ -77,14 +78,15 @@ extern "C" AclStatus AclPackTensor(AclTensorPack external_pack, AclTensor extern
return AclStatus::AclSuccess;
}
-extern "C" AclStatus AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors)
+extern "C" AclStatus
+AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors)
{
using namespace arm_compute;
auto pack = get_internal(external_pack);
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(detail::validate_internal_pack(pack));
- for(unsigned i = 0; i < num_tensors; ++i)
+ for (unsigned i = 0; i < num_tensors; ++i)
{
ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(PackTensorInternal(*pack, external_tensors[i], slot_ids[i]));
}
diff --git a/src/c/AclVersion.cpp b/src/c/AclVersion.cpp
index 971189a6d4..a659e90837 100644
--- a/src/c/AclVersion.cpp
+++ b/src/c/AclVersion.cpp
@@ -25,8 +25,7 @@
namespace
{
-constexpr AclVersion version_info
-{
+constexpr AclVersion version_info{
ARM_COMPUTE_LIBRARY_VERSION_MAJOR,
ARM_COMPUTE_LIBRARY_VERSION_MINOR,
ARM_COMPUTE_LIBRARY_VERSION_PATCH,
diff --git a/src/c/cl/AclOpenClExt.cpp b/src/c/cl/AclOpenClExt.cpp
index e72babcae8..8e42cf5510 100644
--- a/src/c/cl/AclOpenClExt.cpp
+++ b/src/c/cl/AclOpenClExt.cpp
@@ -23,13 +23,12 @@
*/
#include "arm_compute/AclOpenClExt.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/common/ITensorV2.h"
#include "src/common/Types.h"
#include "src/gpu/cl/ClContext.h"
#include "src/gpu/cl/ClQueue.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-
#include "support/Cast.h"
extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl_context)
@@ -37,17 +36,17 @@ extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl
using namespace arm_compute;
IContext *ctx = get_internal(external_ctx);
- if(detail::validate_internal_context(ctx) != StatusCode::Success)
+ if (detail::validate_internal_context(ctx) != StatusCode::Success)
{
return AclStatus::AclInvalidArgument;
}
- if(ctx->type() != Target::GpuOcl)
+ if (ctx->type() != Target::GpuOcl)
{
return AclStatus::AclInvalidTarget;
}
- if(opencl_context == nullptr)
+ if (opencl_context == nullptr)
{
return AclStatus::AclInvalidArgument;
}
@@ -62,23 +61,23 @@ extern "C" AclStatus AclSetClContext(AclContext external_ctx, cl_context opencl_
using namespace arm_compute;
IContext *ctx = get_internal(external_ctx);
- if(detail::validate_internal_context(ctx) != StatusCode::Success)
+ if (detail::validate_internal_context(ctx) != StatusCode::Success)
{
return AclStatus::AclInvalidArgument;
}
- if(ctx->type() != Target::GpuOcl)
+ if (ctx->type() != Target::GpuOcl)
{
return AclStatus::AclInvalidTarget;
}
- if(ctx->refcount() != 0)
+ if (ctx->refcount() != 0)
{
return AclStatus::AclUnsupportedConfig;
}
auto cl_ctx = utils::cast::polymorphic_downcast<arm_compute::gpu::opencl::ClContext *>(ctx);
- if(!cl_ctx->set_cl_ctx(::cl::Context(opencl_context)))
+ if (!cl_ctx->set_cl_ctx(::cl::Context(opencl_context)))
{
return AclStatus::AclRuntimeError;
}
@@ -91,17 +90,17 @@ extern "C" AclStatus AclGetClDevice(AclContext external_ctx, cl_device_id *openc
using namespace arm_compute;
IContext *ctx = get_internal(external_ctx);
- if(detail::validate_internal_context(ctx) != StatusCode::Success)
+ if (detail::validate_internal_context(ctx) != StatusCode::Success)
{
return AclStatus::AclInvalidArgument;
}
- if(ctx->type() != Target::GpuOcl)
+ if (ctx->type() != Target::GpuOcl)
{
return AclStatus::AclInvalidTarget;
}
- if(opencl_device == nullptr)
+ if (opencl_device == nullptr)
{
return AclStatus::AclInvalidArgument;
}
@@ -116,17 +115,17 @@ extern "C" AclStatus AclGetClQueue(AclQueue external_queue, cl_command_queue *op
using namespace arm_compute;
IQueue *queue = get_internal(external_queue);
- if(detail::validate_internal_queue(queue) != StatusCode::Success)
+ if (detail::validate_internal_queue(queue) != StatusCode::Success)
{
return AclStatus::AclInvalidArgument;
}
- if(queue->header.ctx->type() != Target::GpuOcl)
+ if (queue->header.ctx->type() != Target::GpuOcl)
{
return AclStatus::AclInvalidTarget;
}
- if(opencl_queue == nullptr)
+ if (opencl_queue == nullptr)
{
return AclStatus::AclInvalidArgument;
}
@@ -141,18 +140,18 @@ extern "C" AclStatus AclSetClQueue(AclQueue external_queue, cl_command_queue ope
using namespace arm_compute;
IQueue *queue = get_internal(external_queue);
- if(detail::validate_internal_queue(queue) != StatusCode::Success)
+ if (detail::validate_internal_queue(queue) != StatusCode::Success)
{
return AclStatus::AclInvalidArgument;
}
- if(queue->header.ctx->type() != Target::GpuOcl)
+ if (queue->header.ctx->type() != Target::GpuOcl)
{
return AclStatus::AclInvalidTarget;
}
auto cl_queue = utils::cast::polymorphic_downcast<arm_compute::gpu::opencl::ClQueue *>(queue);
- if(!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue)))
+ if (!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue)))
{
return AclStatus::AclRuntimeError;
}
@@ -165,17 +164,17 @@ extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem)
using namespace arm_compute;
ITensorV2 *tensor = get_internal(external_tensor);
- if(detail::validate_internal_tensor(tensor) != StatusCode::Success)
+ if (detail::validate_internal_tensor(tensor) != StatusCode::Success)
{
return AclStatus::AclInvalidArgument;
}
- if(tensor->header.ctx->type() != Target::GpuOcl)
+ if (tensor->header.ctx->type() != Target::GpuOcl)
{
return AclStatus::AclInvalidTarget;
}
- if(opencl_mem == nullptr)
+ if (opencl_mem == nullptr)
{
return AclStatus::AclInvalidArgument;
}
@@ -184,4 +183,4 @@ extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem)
*opencl_mem = cl_tensor->cl_buffer().get();
return AclStatus::AclSuccess;
-}
\ No newline at end of file
+}
diff --git a/src/common/AllocatorWrapper.cpp b/src/common/AllocatorWrapper.cpp
index 7b5bb34433..28d81a9fa4 100644
--- a/src/common/AllocatorWrapper.cpp
+++ b/src/common/AllocatorWrapper.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/common/AllocatorWrapper.h"
+
#include "arm_compute/core/Error.h"
namespace arm_compute
@@ -57,7 +58,7 @@ void AllocatorWrapper::aligned_free(void *ptr)
void AllocatorWrapper::set_user_data(void *user_data)
{
- if(user_data != nullptr)
+ if (user_data != nullptr)
{
_backing_allocator.user_data = user_data;
}
diff --git a/src/common/AllocatorWrapper.h b/src/common/AllocatorWrapper.h
index 5e1f138f16..bbf70a2cb1 100644
--- a/src/common/AllocatorWrapper.h
+++ b/src/common/AllocatorWrapper.h
@@ -37,8 +37,8 @@ public:
* @param[in] backing_allocator Backing memory allocator to be used
*/
AllocatorWrapper(const AclAllocator &backing_allocator) noexcept;
- AllocatorWrapper(const AllocatorWrapper &) noexcept = default;
- AllocatorWrapper(AllocatorWrapper &&) noexcept = default;
+ AllocatorWrapper(const AllocatorWrapper &) noexcept = default;
+ AllocatorWrapper(AllocatorWrapper &&) noexcept = default;
AllocatorWrapper &operator=(const AllocatorWrapper &) noexcept = delete;
AllocatorWrapper &operator=(AllocatorWrapper &&other) noexcept = default;
/** Allocate a chunk of memory of a given size in bytes
@@ -78,4 +78,4 @@ private:
};
} // namespace arm_compute
-#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */
\ No newline at end of file
+#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */
diff --git a/src/common/IContext.h b/src/common/IContext.h
index 65bb76744d..a221e5db61 100644
--- a/src/common/IContext.h
+++ b/src/common/IContext.h
@@ -33,7 +33,7 @@
struct AclContext_
{
- arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Context, nullptr };
+ arm_compute::detail::Header header{arm_compute::detail::ObjectType::Context, nullptr};
protected:
AclContext_() = default;
@@ -51,8 +51,7 @@ class IOperator;
class IContext : public AclContext_
{
public:
- IContext(Target target)
- : AclContext_(), _target(target), _refcount(0)
+ IContext(Target target) : AclContext_(), _target(target), _refcount(0)
{
}
/** Virtual Destructor */
@@ -108,11 +107,11 @@ public:
*
* @return A pointer to the created queue object
*/
- virtual IQueue *create_queue(const AclQueueOptions *options) = 0;
- virtual std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+ virtual IQueue *create_queue(const AclQueueOptions *options) = 0;
+ virtual std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
const AclTensorDescriptor &dst,
const AclActivationDescriptor &act,
- bool is_validate) = 0;
+ bool is_validate) = 0;
private:
Target _target; /**< Target type of context */
@@ -140,7 +139,7 @@ namespace detail
*/
inline StatusCode validate_internal_context(const IContext *ctx)
{
- if(ctx == nullptr || !ctx->is_valid())
+ if (ctx == nullptr || !ctx->is_valid())
{
ARM_COMPUTE_LOG_ERROR_ACL("Invalid context object");
return StatusCode::InvalidArgument;
diff --git a/src/common/IOperator.cpp b/src/common/IOperator.cpp
index b56f0e97fb..90e3473814 100644
--- a/src/common/IOperator.cpp
+++ b/src/common/IOperator.cpp
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
#include "src/common/IOperator.h"
+
#include "src/common/utils/Validate.h"
namespace arm_compute
{
#ifndef DOXYGEN_SKIP_THIS
-IOperator::IOperator(IContext *ctx)
- : AclOperator_()
+IOperator::IOperator(IContext *ctx) : AclOperator_()
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx);
this->header.ctx = ctx;
diff --git a/src/common/IOperator.h b/src/common/IOperator.h
index 1b65a09e0d..e86e11fe25 100644
--- a/src/common/IOperator.h
+++ b/src/common/IOperator.h
@@ -30,13 +30,14 @@
// TODO: Remove when all functions have been ported
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/IOperator.h"
+
#include "src/common/utils/Validate.h"
#include <vector>
struct AclOperator_
{
- arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Operator, nullptr };
+ arm_compute::detail::Header header{arm_compute::detail::ObjectType::Operator, nullptr};
protected:
AclOperator_() = default;
@@ -100,7 +101,7 @@ public:
}
private:
- std::unique_ptr<experimental::IOperator> _op{ nullptr };
+ std::unique_ptr<experimental::IOperator> _op{nullptr};
};
/** Extract internal representation of an Operator
@@ -124,7 +125,7 @@ namespace detail
*/
inline StatusCode validate_internal_operator(const IOperator *op)
{
- if(op == nullptr || !op->is_valid())
+ if (op == nullptr || !op->is_valid())
{
ARM_COMPUTE_LOG_ERROR_ACL("[IOperator]: Invalid operator object");
return StatusCode::InvalidArgument;
diff --git a/src/common/IQueue.h b/src/common/IQueue.h
index 6a0cbc75da..60745d206e 100644
--- a/src/common/IQueue.h
+++ b/src/common/IQueue.h
@@ -28,7 +28,7 @@
struct AclQueue_
{
- arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Queue, nullptr };
+ arm_compute::detail::Header header{arm_compute::detail::ObjectType::Queue, nullptr};
protected:
AclQueue_() = default;
@@ -88,7 +88,7 @@ namespace detail
*/
inline StatusCode validate_internal_queue(const IQueue *queue)
{
- if(queue == nullptr || !queue->is_valid())
+ if (queue == nullptr || !queue->is_valid())
{
ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object");
return StatusCode::InvalidArgument;
diff --git a/src/common/ITensorV2.cpp b/src/common/ITensorV2.cpp
index 39bf1c6fb3..bf3d963926 100644
--- a/src/common/ITensorV2.cpp
+++ b/src/common/ITensorV2.cpp
@@ -22,7 +22,9 @@
* SOFTWARE.
*/
#include "src/common/ITensorV2.h"
+
#include "arm_compute/core/TensorInfo.h"
+
#include "src/common/utils/LegacySupport.h"
namespace arm_compute
@@ -36,4 +38,4 @@ AclTensorDescriptor ITensorV2::get_descriptor() const
{
return detail::convert_to_descriptor(*tensor()->info());
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/common/ITensorV2.h b/src/common/ITensorV2.h
index 965aacea23..903bfad66a 100644
--- a/src/common/ITensorV2.h
+++ b/src/common/ITensorV2.h
@@ -29,7 +29,7 @@
struct AclTensor_
{
- arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Tensor, nullptr };
+ arm_compute::detail::Header header{arm_compute::detail::ObjectType::Tensor, nullptr};
protected:
AclTensor_() = default;
@@ -49,8 +49,7 @@ public:
*
* @param[in] ctx Context to be used by the operator
*/
- explicit ITensorV2(IContext *ctx)
- : AclTensor_()
+ explicit ITensorV2(IContext *ctx) : AclTensor_()
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx);
this->header.ctx = ctx;
@@ -128,7 +127,7 @@ namespace detail
*/
inline StatusCode validate_internal_tensor(const ITensorV2 *tensor)
{
- if(tensor == nullptr || !tensor->is_valid())
+ if (tensor == nullptr || !tensor->is_valid())
{
ARM_COMPUTE_LOG_ERROR_ACL("[ITensorV2]: Invalid tensor object");
return StatusCode::InvalidArgument;
diff --git a/src/common/TensorPack.cpp b/src/common/TensorPack.cpp
index 6c2c7f9622..b51fc0bdd8 100644
--- a/src/common/TensorPack.cpp
+++ b/src/common/TensorPack.cpp
@@ -22,13 +22,13 @@
* SOFTWARE.
*/
#include "src/common/TensorPack.h"
+
#include "src/common/ITensorV2.h"
#include "src/common/utils/Validate.h"
namespace arm_compute
{
-TensorPack::TensorPack(IContext *ctx)
- : AclTensorPack_(), _pack()
+TensorPack::TensorPack(IContext *ctx) : AclTensorPack_(), _pack()
{
ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx);
this->header.ctx = ctx;
diff --git a/src/common/TensorPack.h b/src/common/TensorPack.h
index f330eee740..b3d1624dae 100644
--- a/src/common/TensorPack.h
+++ b/src/common/TensorPack.h
@@ -25,11 +25,12 @@
#define SRC_COMMON_ITENSORPACK_H_
#include "arm_compute/core/ITensorPack.h"
+
#include "src/common/IContext.h"
struct AclTensorPack_
{
- arm_compute::detail::Header header{ arm_compute::detail::ObjectType::TensorPack, nullptr };
+ arm_compute::detail::Header header{arm_compute::detail::ObjectType::TensorPack, nullptr};
protected:
AclTensorPack_() = default;
@@ -118,7 +119,7 @@ namespace detail
*/
inline StatusCode validate_internal_pack(const TensorPack *pack)
{
- if(pack == nullptr || !pack->is_valid())
+ if (pack == nullptr || !pack->is_valid())
{
ARM_COMPUTE_LOG_ERROR_ACL("[TensorPack]: Invalid tensor pack object");
return StatusCode::InvalidArgument;
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index cdcdea916c..23a477332a 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Log.h"
+
#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
@@ -53,16 +54,16 @@
#endif /* defined(__APPLE__) && defined(__aarch64__)) */
#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11)
-#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg \
- : "=r"(var))
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11)
+#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg : "=r"(var))
namespace arm_compute
{
namespace cpuinfo
{
namespace
{
-#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__))
/** Extract MIDR using CPUID information that are exposed to user-space
*
* @param[in] max_num_cpus Maximum number of possible CPUs
@@ -72,15 +73,15 @@ namespace
std::vector<uint32_t> midr_from_cpuid(uint32_t max_num_cpus)
{
std::vector<uint32_t> cpus;
- for(unsigned int i = 0; i < max_num_cpus; ++i)
+ for (unsigned int i = 0; i < max_num_cpus; ++i)
{
std::stringstream str;
str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1";
std::ifstream file(str.str(), std::ios::in);
- if(file.is_open())
+ if (file.is_open())
{
std::string line;
- if(bool(getline(file, line)))
+ if (bool(getline(file, line)))
{
cpus.emplace_back(support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16));
}
@@ -122,34 +123,35 @@ std::vector<uint32_t> midr_from_proc_cpuinfo(int max_num_cpus)
ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
std::ifstream file("/proc/cpuinfo", std::ios::in);
- if(file.is_open())
+ if (file.is_open())
{
std::string line;
int midr = 0;
int curcpu = -1;
- while(bool(getline(file, line)))
+ while (bool(getline(file, line)))
{
std::array<regmatch_t, 2> match;
ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0);
- if(ret_status == 0)
+ if (ret_status == 0)
{
std::string id = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int newcpu = support::cpp11::stoi(id, nullptr);
- if(curcpu >= 0 && midr == 0)
+ if (curcpu >= 0 && midr == 0)
{
// Matched a new CPU ID without any description of the previous one - looks like old format.
return {};
}
- if(curcpu >= 0 && curcpu < max_num_cpus)
+ if (curcpu >= 0 && curcpu < max_num_cpus)
{
cpus.emplace_back(midr);
}
else
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE(
+ "Trying to populate a core id with id greater than the expected number of cores!");
}
midr = 0;
@@ -159,7 +161,7 @@ std::vector<uint32_t> midr_from_proc_cpuinfo(int max_num_cpus)
}
ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0);
- if(ret_status == 0)
+ if (ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int impv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
@@ -169,7 +171,7 @@ std::vector<uint32_t> midr_from_proc_cpuinfo(int max_num_cpus)
}
ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0);
- if(ret_status == 0)
+ if (ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int varv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
@@ -179,7 +181,7 @@ std::vector<uint32_t> midr_from_proc_cpuinfo(int max_num_cpus)
}
ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0);
- if(ret_status == 0)
+ if (ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int partv = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
@@ -189,7 +191,7 @@ std::vector<uint32_t> midr_from_proc_cpuinfo(int max_num_cpus)
}
ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0);
- if(ret_status == 0)
+ if (ret_status == 0)
{
std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
int regv = support::cpp11::stoi(subexp, nullptr);
@@ -200,13 +202,14 @@ std::vector<uint32_t> midr_from_proc_cpuinfo(int max_num_cpus)
}
}
- if(curcpu >= 0 && curcpu < max_num_cpus)
+ if (curcpu >= 0 && curcpu < max_num_cpus)
{
cpus.emplace_back(midr);
}
else
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE(
+ "Trying to populate a core id with id greater than the expected number of cores!");
}
}
@@ -231,11 +234,11 @@ int get_max_cpus()
CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
bool success = false;
- if(CPUspresent.is_open())
+ if (CPUspresent.is_open())
{
std::string line;
- if(bool(getline(CPUspresent, line)))
+ if (bool(getline(CPUspresent, line)))
{
/* The content of this file is a list of ranges or single values, e.g.
* 0-5, or 1-3,5,7 or similar. As we are interested in the
@@ -244,9 +247,9 @@ int get_max_cpus()
*/
auto startfrom = line.begin();
- for(auto i = line.begin(); i < line.end(); ++i)
+ for (auto i = line.begin(); i < line.end(); ++i)
{
- if(*i == '-' || *i == ',')
+ if (*i == '-' || *i == ',')
{
startfrom = i + 1;
}
@@ -260,13 +263,14 @@ int get_max_cpus()
}
// Return std::thread::hardware_concurrency() as a fallback.
- if(!success)
+ if (!success)
{
max_cpus = std::thread::hardware_concurrency();
}
return max_cpus;
}
-#elif defined(__aarch64__) && defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif defined(__aarch64__) && \
+ defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
/** Query features through sysctlbyname
*
* @return int value queried
@@ -278,46 +282,45 @@ int get_hw_capability(const std::string &cap)
sysctlbyname(cap.c_str(), &result, &size, NULL, 0);
return result;
}
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
#if defined(BARE_METAL) && defined(__aarch64__)
uint64_t get_sve_feature_reg()
{
uint64_t svefr0 = 0;
- __asm __volatile(
- ".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n"
- "MOV %0, X3"
- : "=r"(svefr0)
- :
- : "x3");
+ __asm __volatile(".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n"
+ "MOV %0, X3"
+ : "=r"(svefr0)
+ :
+ : "x3");
return svefr0;
}
#endif /* defined(BARE_METAL) && defined(__aarch64__) */
} // namespace
-CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector<CpuModel> cpus)
- : _isa(std::move(isa)), _cpus(std::move(cpus))
+CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector<CpuModel> cpus) : _isa(std::move(isa)), _cpus(std::move(cpus))
{
}
CpuInfo CpuInfo::build()
{
-#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+ (defined(__arm__) || defined(__aarch64__))
const uint32_t hwcaps = getauxval(AT_HWCAP);
const uint32_t hwcaps2 = getauxval(AT_HWCAP2);
const uint32_t max_cpus = get_max_cpus();
// Populate midr values
std::vector<uint32_t> cpus_midr;
- if(hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID)
+ if (hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID)
{
cpus_midr = midr_from_cpuid(max_cpus);
}
- if(cpus_midr.empty())
+ if (cpus_midr.empty())
{
cpus_midr = midr_from_proc_cpuinfo(max_cpus);
}
- if(cpus_midr.empty())
+ if (cpus_midr.empty())
{
cpus_midr.resize(max_cpus, 0);
}
@@ -333,7 +336,9 @@ CpuInfo CpuInfo::build()
CpuInfo info(isa, cpus_model);
return info;
-#elif(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif (BARE_METAL) && \
+ defined( \
+ __aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
// Assume single CPU in bare metal mode. Just read the ID register and feature bits directly.
uint64_t isar0 = 0, isar1 = 0, pfr0 = 0, pfr1 = 0, svefr0 = 0, midr = 0;
@@ -342,7 +347,7 @@ CpuInfo CpuInfo::build()
ARM_COMPUTE_GET_FEATURE_REG(pfr0, ID_AA64PFR0_EL1);
ARM_COMPUTE_GET_FEATURE_REG(pfr1, ID_AA64PFR1_EL1);
ARM_COMPUTE_GET_FEATURE_REG(midr, MIDR_EL1);
- if((pfr0 >> 32) & 0xf)
+ if ((pfr0 >> 32) & 0xf)
{
svefr0 = get_sve_feature_reg();
}
@@ -361,14 +366,14 @@ CpuInfo CpuInfo::build()
CpuInfo info(isainfo, cpus_model);
return info;
#else /* #elif defined(__aarch64__) && defined(__APPLE__) */
- CpuInfo info(CpuIsaInfo(), { CpuModel::GENERIC });
+ CpuInfo info(CpuIsaInfo(), {CpuModel::GENERIC});
return info;
-#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
}
CpuModel CpuInfo::cpu_model(uint32_t cpuid) const
{
- if(cpuid < _cpus.size())
+ if (cpuid < _cpus.size())
{
return _cpus[cpuid];
}
@@ -377,9 +382,10 @@ CpuModel CpuInfo::cpu_model(uint32_t cpuid) const
CpuModel CpuInfo::cpu_model() const
{
-#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__))
+#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || \
+ (!defined(__arm__) && !defined(__aarch64__))
return cpu_model(0);
-#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
+#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
return cpu_model(sched_getcpu());
#endif /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
}
@@ -406,13 +412,13 @@ uint32_t num_threads_hint()
// Read cpuinfo and get occurrence of each core
std::ifstream cpuinfo_file("/proc/cpuinfo", std::ios::in);
- if(cpuinfo_file.is_open())
+ if (cpuinfo_file.is_open())
{
std::string line;
- while(bool(getline(cpuinfo_file, line)))
+ while (bool(getline(cpuinfo_file, line)))
{
std::array<regmatch_t, 2> match;
- if(regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0)
+ if (regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0)
{
cpus.emplace_back(line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)));
}
@@ -425,13 +431,13 @@ uint32_t num_threads_hint()
auto least_frequent_cpu_occurences = [](const std::vector<std::string> &cpus) -> uint32_t
{
std::unordered_map<std::string, uint32_t> cpus_freq;
- for(const auto &cpu : cpus)
+ for (const auto &cpu : cpus)
{
cpus_freq[cpu]++;
}
uint32_t vmin = cpus.size() + 1;
- for(const auto &cpu_freq : cpus_freq)
+ for (const auto &cpu_freq : cpus_freq)
{
vmin = std::min(vmin, cpu_freq.second);
}
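
A minimal standalone sketch of the counting heuristic in the hunk above: tally how often each CPU part string occurs and take the least frequent count as the worker-thread hint, on the assumption that the "big" cluster is the smaller one. Names and the sample part strings are illustrative, not the library's API.

#include <algorithm>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Count occurrences of each CPU part string and return the smallest count.
static uint32_t least_frequent_part_count(const std::vector<std::string> &parts)
{
    std::unordered_map<std::string, uint32_t> freq;
    for (const auto &p : parts)
    {
        ++freq[p];
    }
    uint32_t vmin = static_cast<uint32_t>(parts.size()) + 1;
    for (const auto &kv : freq)
    {
        vmin = std::min(vmin, kv.second);
    }
    return freq.empty() ? 1u : vmin;
}

int main()
{
    // Four little cores (0xd05) and two big cores (0xd0a) -> hint of 2.
    const std::vector<std::string> parts = {"0xd05", "0xd05", "0xd05", "0xd05", "0xd0a", "0xd0a"};
    return static_cast<int>(least_frequent_part_count(parts));
}
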
diff --git a/src/common/cpuinfo/CpuIsaInfo.cpp b/src/common/cpuinfo/CpuIsaInfo.cpp
index 23da54a35d..597768530b 100644
--- a/src/common/cpuinfo/CpuIsaInfo.cpp
+++ b/src/common/cpuinfo/CpuIsaInfo.cpp
@@ -24,6 +24,7 @@
#include "src/common/cpuinfo/CpuIsaInfo.h"
#include "arm_compute/core/Error.h"
+
#include "src/common/cpuinfo/CpuModel.h"
/* Arm Feature flags */
@@ -31,18 +32,18 @@
#define ARM_COMPUTE_CPU_FEATURE_HWCAP_NEON (1 << 12)
/* Arm64 Feature flags */
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 20)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 20)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9)
#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEF32MM (1 << 10)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23)
namespace arm_compute
{
@@ -71,12 +72,12 @@ void decode_hwcaps(CpuIsaInfo &isa, const uint32_t hwcaps, const uint32_t hwcaps
isa.sve2 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2);
    // Detection of SME from type HWCAP2 in the auxiliary vector
- isa.sme = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME);
- isa.sme2 = isa.sme; // Needs to be set properly
+ isa.sme = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME);
+ isa.sme2 = isa.sme; // Needs to be set properly
// Data-type support
- isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP);
- isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16);
+ isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP);
+ isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16);
isa.svebf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16);
// Instruction extensions
@@ -92,12 +93,15 @@ void decode_hwcaps(CpuIsaInfo &isa, const uint32_t hwcaps, const uint32_t hwcaps
}
#endif /* defined(__aarch64__) */
-void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, const uint64_t pfr0, const uint64_t pfr1, const uint64_t svefr0)
+void decode_regs(CpuIsaInfo &isa,
+ const uint64_t isar0,
+ const uint64_t isar1,
+ const uint64_t pfr0,
+ const uint64_t pfr1,
+ const uint64_t svefr0)
{
auto is_supported = [](uint64_t feature_reg, uint8_t feature_pos) -> bool
- {
- return ((feature_reg >> feature_pos) & 0xf);
- };
+ { return ((feature_reg >> feature_pos) & 0xf); };
// High-level SIMD support
isa.sve = is_supported(pfr0, 32);
@@ -124,11 +128,11 @@ void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, co
*/
void allowlisted_model_features(CpuIsaInfo &isa, CpuModel model)
{
- if(isa.dot == false)
+ if (isa.dot == false)
{
isa.dot = model_supports_dot(model);
}
- if(isa.fp16 == false)
+ if (isa.fp16 == false)
{
isa.fp16 = model_supports_fp16(model);
}
@@ -147,7 +151,8 @@ CpuIsaInfo init_cpu_isa_from_hwcaps(uint32_t hwcaps, uint32_t hwcaps2, uint32_t
return isa;
}
-CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr)
+CpuIsaInfo
+init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr)
{
CpuIsaInfo isa;
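
For reference, a minimal sketch of the ID-register decoding used above: each AArch64 ID register packs one feature per 4-bit field, and a non-zero nibble means the feature is implemented. The SVE field of ID_AA64PFR0_EL1 sits at bits [35:32], which is why the code tests (pfr0 >> 32) & 0xf. The register value below is made up for illustration.

#include <cstdint>

// True when the 4-bit ID-register field starting at bit `pos` is non-zero.
static bool id_field_nonzero(uint64_t reg, unsigned pos)
{
    return ((reg >> pos) & 0xfu) != 0u;
}

int main()
{
    const uint64_t pfr0 = 0x0000000100000000ull; // illustrative: SVE field (bits [35:32]) = 1
    return id_field_nonzero(pfr0, 32) ? 0 : 1;
}
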
diff --git a/src/common/cpuinfo/CpuIsaInfo.h b/src/common/cpuinfo/CpuIsaInfo.h
index b92b6538b6..9d6bc07b67 100644
--- a/src/common/cpuinfo/CpuIsaInfo.h
+++ b/src/common/cpuinfo/CpuIsaInfo.h
@@ -37,22 +37,22 @@ namespace cpuinfo
struct CpuIsaInfo
{
/* SIMD extension support */
- bool neon{ false };
- bool sve{ false };
- bool sve2{ false };
- bool sme{ false };
- bool sme2{ false };
+ bool neon{false};
+ bool sve{false};
+ bool sve2{false};
+ bool sme{false};
+ bool sme2{false};
/* Data-type extensions support */
- bool fp16{ false };
- bool bf16{ false };
- bool svebf16{ false };
+ bool fp16{false};
+ bool bf16{false};
+ bool svebf16{false};
/* Instruction support */
- bool dot{ false };
- bool i8mm{ false };
- bool svei8mm{ false };
- bool svef32mm{ false };
+ bool dot{false};
+ bool i8mm{false};
+ bool svei8mm{false};
+ bool svef32mm{false};
};
/** Identify ISA related information through system information
@@ -76,7 +76,8 @@ CpuIsaInfo init_cpu_isa_from_hwcaps(uint32_t hwcaps, uint32_t hwcaps2, uint32_t
*
* @return CpuIsaInfo A populated ISA feature structure
*/
-CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr);
+CpuIsaInfo
+init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr);
} // namespace cpuinfo
} // namespace arm_compute
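
A hedged usage sketch of the hwcap path above, assuming a Linux/AArch64 target where getauxval is available. The bit constant mirrors the HWCAP2 SVE2 flag defined in CpuIsaInfo.cpp but is redeclared locally so the snippet stands alone.

#include <cstdint>
#include <cstdio>

#include <sys/auxv.h> // Linux-only: getauxval, AT_HWCAP2

// Local mirror of the HWCAP2 SVE2 bit; a real build would use the library's macro.
constexpr uint64_t LOCAL_HWCAP2_SVE2 = 1u << 1;

int main()
{
    const uint64_t hwcaps2 = getauxval(AT_HWCAP2);
    std::printf("SVE2 reported by the kernel: %s\n", (hwcaps2 & LOCAL_HWCAP2_SVE2) ? "yes" : "no");
    return 0;
}
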
diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp
index d6d91df133..0455670302 100644
--- a/src/common/cpuinfo/CpuModel.cpp
+++ b/src/common/cpuinfo/CpuModel.cpp
@@ -29,12 +29,12 @@ namespace cpuinfo
{
std::string cpu_model_to_string(CpuModel model)
{
- switch(model)
+ switch (model)
{
#define X(MODEL) \
-case CpuModel::MODEL: \
- return #MODEL;
- ARM_COMPUTE_CPU_MODEL_LIST
+ case CpuModel::MODEL: \
+ return #MODEL;
+ ARM_COMPUTE_CPU_MODEL_LIST
#undef X
default:
{
@@ -45,7 +45,7 @@ case CpuModel::MODEL: \
bool model_supports_fp16(CpuModel model)
{
- switch(model)
+ switch (model)
{
case CpuModel::GENERIC_FP16:
case CpuModel::GENERIC_FP16_DOT:
@@ -63,7 +63,7 @@ bool model_supports_fp16(CpuModel model)
bool model_supports_dot(CpuModel model)
{
- switch(model)
+ switch (model)
{
case CpuModel::GENERIC_FP16_DOT:
case CpuModel::A55r1:
@@ -87,16 +87,16 @@ CpuModel midr_to_model(uint32_t midr)
const int cpunum = (midr >> 4) & 0xFFF;
// Only CPUs we have code paths for are detected. All other CPUs can be safely classed as "GENERIC"
- if(implementer == 0x41) // Arm CPUs
+ if (implementer == 0x41) // Arm CPUs
{
- switch(cpunum)
+ switch (cpunum)
{
case 0xd03: // A53
case 0xd04: // A35
model = CpuModel::A53;
break;
case 0xd05: // A55
- if(variant != 0)
+ if (variant != 0)
{
model = CpuModel::A55r1;
}
@@ -109,7 +109,7 @@ CpuModel midr_to_model(uint32_t midr)
model = CpuModel::A73;
break;
case 0xd0a: // A75
- if(variant != 0)
+ if (variant != 0)
{
model = CpuModel::GENERIC_FP16_DOT;
}
@@ -144,9 +144,9 @@ CpuModel midr_to_model(uint32_t midr)
break;
}
}
- else if(implementer == 0x46)
+ else if (implementer == 0x46)
{
- switch(cpunum)
+ switch (cpunum)
{
case 0x001: // A64FX
model = CpuModel::A64FX;
@@ -156,9 +156,9 @@ CpuModel midr_to_model(uint32_t midr)
break;
}
}
- else if(implementer == 0x48)
+ else if (implementer == 0x48)
{
- switch(cpunum)
+ switch (cpunum)
{
case 0xd40: // A76
model = CpuModel::GENERIC_FP16_DOT;
@@ -168,9 +168,9 @@ CpuModel midr_to_model(uint32_t midr)
break;
}
}
- else if(implementer == 0x51)
+ else if (implementer == 0x51)
{
- switch(cpunum)
+ switch (cpunum)
{
case 0x800: // A73
model = CpuModel::A73;
@@ -196,4 +196,4 @@ CpuModel midr_to_model(uint32_t midr)
return model;
}
} // namespace cpuinfo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
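
A small sketch of the MIDR field extraction that midr_to_model relies on, following the architectural layout: implementer in bits [31:24], variant in [23:20], part number in [15:4]. The sample register value is illustrative, not taken from a real device.

#include <cstdint>
#include <cstdio>

struct MidrFields
{
    uint32_t implementer;
    uint32_t variant;
    uint32_t part;
};

static MidrFields decode_midr(uint32_t midr)
{
    return MidrFields{(midr >> 24) & 0xFF, (midr >> 20) & 0xF, (midr >> 4) & 0xFFF};
}

int main()
{
    // 0x41 = Arm, part 0xd05 = A55 in the table above; variant 0 maps to the non-r1 path.
    const MidrFields f = decode_midr(0x410FD050u);
    std::printf("implementer=0x%02X variant=%u part=0x%03X\n", f.implementer, f.variant, f.part);
    return 0;
}
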
diff --git a/src/common/cpuinfo/CpuModel.h b/src/common/cpuinfo/CpuModel.h
index 4fe6c29e53..3b9d9e3494 100644
--- a/src/common/cpuinfo/CpuModel.h
+++ b/src/common/cpuinfo/CpuModel.h
@@ -24,11 +24,11 @@
#ifndef SRC_COMMON_CPUINFO_CPUMODEL_H
#define SRC_COMMON_CPUINFO_CPUMODEL_H
+#include "arm_compute/core/CPP/CPPTypes.h"
+
#include <cstdint>
#include <string>
-#include "arm_compute/core/CPP/CPPTypes.h"
-
namespace arm_compute
{
namespace cpuinfo
diff --git a/src/common/utils/LegacySupport.cpp b/src/common/utils/LegacySupport.cpp
index 06b1693bd1..102644227e 100644
--- a/src/common/utils/LegacySupport.cpp
+++ b/src/common/utils/LegacySupport.cpp
@@ -33,7 +33,7 @@ namespace
{
DataType convert_to_legacy_data_type(AclDataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case AclDataType::AclFloat32:
return DataType::F32;
@@ -48,7 +48,7 @@ DataType convert_to_legacy_data_type(AclDataType data_type)
AclDataType convert_to_c_data_type(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return AclDataType::AclFloat32;
@@ -64,7 +64,7 @@ AclDataType convert_to_c_data_type(DataType data_type)
TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape)
{
TensorShape legacy_shape{};
- for(int32_t d = 0; d < ndims; ++d)
+ for (int32_t d = 0; d < ndims; ++d)
{
legacy_shape.set(d, shape[d], false);
}
@@ -73,14 +73,14 @@ TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape)
int32_t *create_tensor_shape_array(const TensorInfo &info)
{
const auto num_dims = info.num_dimensions();
- if(num_dims <= 0)
+ if (num_dims <= 0)
{
return nullptr;
}
int32_t *shape_array = new int32_t[num_dims];
- for(size_t d = 0; d < num_dims; ++d)
+ for (size_t d = 0; d < num_dims; ++d)
{
shape_array[d] = info.tensor_shape()[d];
}
@@ -92,28 +92,23 @@ int32_t *create_tensor_shape_array(const TensorInfo &info)
TensorInfo convert_to_legacy_tensor_info(const AclTensorDescriptor &desc)
{
TensorInfo legacy_desc;
- legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, convert_to_legacy_data_type(desc.data_type));
+ legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1,
+ convert_to_legacy_data_type(desc.data_type));
return legacy_desc;
}
AclTensorDescriptor convert_to_descriptor(const TensorInfo &info)
{
const auto num_dims = info.num_dimensions();
- AclTensorDescriptor desc
- {
- static_cast<int32_t>(num_dims),
- create_tensor_shape_array(info),
- convert_to_c_data_type(info.data_type()),
- nullptr,
- 0
- };
+ AclTensorDescriptor desc{static_cast<int32_t>(num_dims), create_tensor_shape_array(info),
+ convert_to_c_data_type(info.data_type()), nullptr, 0};
return desc;
}
ActivationLayerInfo convert_to_activation_info(const AclActivationDescriptor &desc)
{
ActivationLayerInfo::ActivationFunction act;
- switch(desc.type)
+ switch (desc.type)
{
case AclActivationType::AclIdentity:
act = ActivationLayerInfo::ActivationFunction::IDENTITY;
diff --git a/src/common/utils/Log.h b/src/common/utils/Log.h
index bbfe1ce1b3..6ebfed366e 100644
--- a/src/common/utils/Log.h
+++ b/src/common/utils/Log.h
@@ -38,20 +38,22 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/logging/Macros.h"
+
#include "utils/TypePrinter.h"
/** Create a logger
*
 * @note It will eventually create all default loggers if they don't exist
*/
-#define ARM_COMPUTE_CREATE_ACL_LOGGER() \
- do \
- { \
- if(arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr) \
- { \
- arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", arm_compute::logging::LogLevel::INFO); \
- } \
- } while(false)
+#define ARM_COMPUTE_CREATE_ACL_LOGGER() \
+ do \
+ { \
+ if (arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr) \
+ { \
+ arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", \
+ arm_compute::logging::LogLevel::INFO); \
+ } \
+ } while (false)
/** Log a message to the logger
*
@@ -63,7 +65,7 @@
{ \
ARM_COMPUTE_CREATE_ACL_LOGGER(); \
ARM_COMPUTE_LOG_MSG("ComputeLibrary", log_level, msg); \
- } while(false)
+ } while (false)
/** Log a message with format to the logger
*
@@ -76,7 +78,7 @@
{ \
ARM_COMPUTE_CREATE_ACL_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FORMAT("ComputeLibrary", log_level, fmt, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Log an error message to the logger
*
@@ -87,7 +89,7 @@
{ \
ARM_COMPUTE_CREATE_ACL_LOGGER(); \
ARM_COMPUTE_LOG_MSG("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \
- } while(false)
+ } while (false)
/** Log an error message to the logger with function name before the message
*
@@ -98,7 +100,7 @@
{ \
ARM_COMPUTE_CREATE_ACL_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \
- } while(false)
+ } while (false)
/** Log an information message to the logger with function name before the message
*
@@ -109,7 +111,7 @@
{ \
ARM_COMPUTE_CREATE_ACL_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::INFO, msg); \
- } while(false)
+ } while (false)
/** Function template specialization for the out of bound element at index = tuple_size
*
@@ -131,12 +133,13 @@ logParamsImpl(std::vector<std::string> &data_registry, const std::tuple<Tp...> &
* @param[in] in_params_tuple Constant reference to a tuple of different input data types
*/
template <std::size_t Index, typename... Tp>
-inline typename std::enable_if < Index<sizeof...(Tp), void>::type
-logParamsImpl(std::vector<std::string> &data_registry, const std::tuple<Tp...> &in_params_tuple)
+ inline typename std::enable_if <
+ Index<sizeof...(Tp), void>::type logParamsImpl(std::vector<std::string> &data_registry,
+ const std::tuple<Tp...> &in_params_tuple)
{
data_registry.push_back(arm_compute::to_string(std::get<Index>(in_params_tuple)));
// Unfold the next tuple element
- logParamsImpl < Index + 1, Tp... > (data_registry, in_params_tuple);
+ logParamsImpl<Index + 1, Tp...>(data_registry, in_params_tuple);
}
/** Function Template with variable number of inputs to collect all the passed parameters from
@@ -149,10 +152,10 @@ logParamsImpl(std::vector<std::string> &data_registry, const std::tuple<Tp...> &
* @return Vector of the parameters' data in a string format
*/
template <typename... Ts>
-const std::vector<std::string> logParams(Ts &&... ins)
+const std::vector<std::string> logParams(Ts &&...ins)
{
std::vector<std::string> data_registry{};
- std::tuple<Ts...> in_params_tuple{ ins... };
+ std::tuple<Ts...> in_params_tuple{ins...};
// Start logging the tuple elements, starting from 0 to tuple_size-1
logParamsImpl<0>(data_registry, in_params_tuple);
@@ -178,11 +181,11 @@ inline const std::vector<std::string> getParamsNames(const std::string &in_param
    // Usually the input parameters string would be the names of the parameters separated
// by ',' e.g. "src0, src1, policy"
- while(std::getline(ss, temp, ','))
+ while (std::getline(ss, temp, ','))
{
names.push_back(temp);
}
- for(auto &name : names)
+ for (auto &name : names)
{
// Totally get rid of white space characters
name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
@@ -205,7 +208,7 @@ inline const std::string constructDataLog(const std::vector<std::string> &params
{
std::string dataLog = "\n ";
ARM_COMPUTE_ERROR_ON(params_names.size() != data_registry.size());
- for(uint8_t i = 0; i < params_names.size(); ++i)
+ for (uint8_t i = 0; i < params_names.size(); ++i)
{
dataLog += params_names[i] + ": " + data_registry.at(i) + "\n ";
}
@@ -220,11 +223,11 @@ inline const std::string constructDataLog(const std::vector<std::string> &params
*
* @param[in] ... Input parameters
*/
-#define ARM_COMPUTE_LOG_PARAMS(...) \
- do \
- { \
- ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(constructDataLog(getParamsNames(#__VA_ARGS__), \
- logParams(__VA_ARGS__))); \
- } while(false)
+#define ARM_COMPUTE_LOG_PARAMS(...) \
+ do \
+ { \
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL( \
+ constructDataLog(getParamsNames(#__VA_ARGS__), logParams(__VA_ARGS__))); \
+ } while (false)
#endif /* ARM_COMPUTE_LOGGING_ENABLED */
#endif /* SRC_COMMON_LOG_H */
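
The logParamsImpl pair above walks a tuple at compile time with two enable_if-gated overloads: one stops when Index reaches sizeof...(Tp), the other handles element Index and recurses. A self-contained sketch of the same technique, printing each element rather than stringifying it:

#include <cstddef>
#include <iostream>
#include <string>
#include <tuple>
#include <type_traits>

// Recursion terminator: nothing left to print once Index == sizeof...(Tp).
template <std::size_t Index, typename... Tp>
typename std::enable_if<Index == sizeof...(Tp), void>::type print_tuple(const std::tuple<Tp...> &)
{
}

// Print element Index, then unfold the next element.
template <std::size_t Index, typename... Tp>
typename std::enable_if<(Index < sizeof...(Tp)), void>::type print_tuple(const std::tuple<Tp...> &t)
{
    std::cout << std::get<Index>(t) << '\n';
    print_tuple<Index + 1, Tp...>(t);
}

int main()
{
    print_tuple<0>(std::make_tuple(1, std::string("src0"), 2.5));
    return 0;
}
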
diff --git a/src/common/utils/Macros.h b/src/common/utils/Macros.h
index 2e44ea599e..35f7e759d3 100644
--- a/src/common/utils/Macros.h
+++ b/src/common/utils/Macros.h
@@ -28,7 +28,7 @@
#define ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status) \
{ \
- if(status != arm_compute::StatusCode::Success) \
+ if (status != arm_compute::StatusCode::Success) \
{ \
return arm_compute::utils::as_cenum<AclStatus>(status); \
} \
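
The logging macros reformatted in Log.h above all end in } while (false). A minimal illustration of why: the do/while wrapper turns a multi-statement macro body into a single statement, so the macro composes safely with an unbraced if/else. The macro below is illustrative only, not one of the library's.

#include <cstdio>

#define LOG_TWO_LINES(msg)           \
    do                               \
    {                                \
        std::printf("%s\n", (msg));  \
        std::printf("  (logged)\n"); \
    } while (false)

int main(int argc, char **)
{
    // Both branches stay well-formed even without braces around the macro call.
    if (argc > 1)
        LOG_TWO_LINES("got arguments");
    else
        LOG_TWO_LINES("no arguments");
    return 0;
}
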
diff --git a/src/common/utils/Object.h b/src/common/utils/Object.h
index 1f194737d4..b73de8e430 100644
--- a/src/common/utils/Object.h
+++ b/src/common/utils/Object.h
@@ -52,14 +52,12 @@ struct Header
* @param[in] type_ Object identification type
* @param[in] ctx_ Context to reference
*/
- Header(ObjectType type_, IContext *ctx_) noexcept
- : type(type_),
- ctx(ctx_)
+ Header(ObjectType type_, IContext *ctx_) noexcept : type(type_), ctx(ctx_)
{
}
- ObjectType type{ ObjectType::Invalid };
- IContext *ctx{ nullptr };
+ ObjectType type{ObjectType::Invalid};
+ IContext *ctx{nullptr};
};
} // namespace detail
} // namespace arm_compute
diff --git a/src/common/utils/Utils.h b/src/common/utils/Utils.h
index 1bd1c7ec57..33fe6c0e81 100644
--- a/src/common/utils/Utils.h
+++ b/src/common/utils/Utils.h
@@ -74,10 +74,7 @@ constexpr SE as_enum(const E val) noexcept
template <typename E>
bool is_in(E check, std::initializer_list<E> list)
{
- return std::any_of(list.begin(), list.end(), [&check](E e)
- {
- return check == e;
- });
+ return std::any_of(list.begin(), list.end(), [&check](E e) { return check == e; });
}
} // namespace utils
} // namespace arm_compute
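
A short usage sketch of the any_of helper collapsed onto one line above; the enum and values are illustrative.

#include <algorithm>
#include <initializer_list>

template <typename E>
bool is_in(E check, std::initializer_list<E> list)
{
    // True when check equals any element of list; the lambda is the single-line form shown above.
    return std::any_of(list.begin(), list.end(), [&check](E e) { return check == e; });
}

int main()
{
    enum class DataType { U8, S8, F16, F32 };
    return is_in(DataType::F16, {DataType::F16, DataType::F32}) ? 0 : 1;
}
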
diff --git a/src/common/utils/Validate.h b/src/common/utils/Validate.h
index 4e8807273a..97819c619f 100644
--- a/src/common/utils/Validate.h
+++ b/src/common/utils/Validate.h
@@ -29,7 +29,7 @@
#include <cassert>
-#define ARM_COMPUTE_ASSERT(cond) assert(cond)
+#define ARM_COMPUTE_ASSERT(cond) assert(cond)
#define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr) assert((ptr) != nullptr)
#else /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index ca2f7d238f..52be6990ab 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -28,12 +28,14 @@
using namespace arm_compute;
-AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
- : _info(info)
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info)
{
}
-ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_UNUSED(input_valid_region);
@@ -45,17 +47,17 @@ ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
ValidRegion AccessWindowAutoPadding::compute_valid_region() const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return ValidRegion{};
}
- return ValidRegion{ Coordinates(), _info->tensor_shape() };
+ return ValidRegion{Coordinates(), _info->tensor_shape()};
}
void AccessWindowAutoPadding::set_valid_region()
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return;
}
@@ -75,7 +77,7 @@ bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window)
ARM_COMPUTE_UNUSED(window);
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h
index b8d1508679..406bdba0d8 100644
--- a/src/core/AccessWindowAutoPadding.h
+++ b/src/core/AccessWindowAutoPadding.h
@@ -74,9 +74,12 @@ public:
ValidRegion compute_valid_region() const;
// Inherited methods overridden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
private:
ITensorInfo *_info;
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 0607011bc5..98182b1202 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -34,7 +34,10 @@ AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start
{
}
-ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
ARM_COMPUTE_UNUSED(border_undefined);
ARM_COMPUTE_UNUSED(border_size);
@@ -44,7 +47,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -57,7 +60,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
// Start of the valid region is equal to the start of the static access but
// never outside of the tensor.
anchor.set(0, std::max<int>(0, _start_x));
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
anchor.set(1, std::max<int>(0, _start_y));
}
@@ -65,7 +68,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
// End of the valid region is equal to the end of the static access but
// never outside of the tensor.
shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
}
@@ -75,7 +78,7 @@ ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, Valid
void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region)
{
- if(_info != nullptr)
+ if (_info != nullptr)
{
_info->set_valid_region(compute_valid_region(window, input_valid_region));
}
@@ -84,7 +87,7 @@ void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegio
bool AccessWindowStatic::update_window_if_needed(Window &window) const
{
// If the padding is not enough and the tensor is not resizable, shrink the window to size 0
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
@@ -96,48 +99,50 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
bool window_modified = false;
// Calculate if padding is enough
- if(_start_y < 0)
+ if (_start_y < 0)
{
const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
- if(_start_y < front_pad_y_available)
+ if (_start_y < front_pad_y_available)
{
window_modified = true;
}
}
- if(!window_modified)
+ if (!window_modified)
{
- if(_end_y > static_cast<int>(shape[1]))
+ if (_end_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1];
- if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
{
window_modified = true;
}
}
- if(!window_modified)
+ if (!window_modified)
{
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
- if(_start_x < 0)
+ if (_start_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(_start_x < front_pad_x_available)
+ if (_start_x < front_pad_x_available)
{
window_modified = true;
}
}
- if(!window_modified && _end_x > static_cast<int>(shape[0]))
+ if (!window_modified && _end_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0];
- if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
{
window_modified = true;
}
@@ -146,9 +151,9 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
}
// If padding is not enough
- if(window_modified)
+ if (window_modified)
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
window.set(i, Window::Dimension(0, 0, 1));
}
@@ -162,7 +167,7 @@ bool AccessWindowStatic::update_padding_if_needed(const Window &window)
ARM_COMPUTE_UNUSED(window);
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
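
A worked example, with made-up numbers, of the tail-padding check reformatted above: the row stride says how many bytes each row occupies including padding, so the spare elements at the end of a row come out as (stride_y / strides[0]) - shape[0].

#include <cstdio>

int main()
{
    const int shape_x  = 10;     // elements per row
    const int stride_x = 4;      // bytes per element (e.g. float)
    const int stride_y = 12 * 4; // bytes per row, i.e. 12 elements' worth including padding
    const int tail_pad_x_available = (stride_y / stride_x) - shape_x;
    std::printf("tail padding available on x: %d elements\n", tail_pad_x_available); // prints 2
    return 0;
}
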
diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h
index f7d43cbb55..5c6d2c7db0 100644
--- a/src/core/AccessWindowStatic.h
+++ b/src/core/AccessWindowStatic.h
@@ -86,9 +86,12 @@ public:
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
    // Inherited methods overridden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
private:
ITensorInfo *_info;
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index d8bd4c4de1..42f0081c14 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -29,9 +29,12 @@
using namespace arm_compute;
-ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -41,7 +44,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
Coordinates old_anchor(anchor);
TensorShape old_shape(shape);
- if(!border_undefined)
+ if (!border_undefined)
{
border_size = BorderSize(0);
}
@@ -53,7 +56,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
// the kernel to write back output values.
// As the relation between input and output is transposed window.y() is
// used for x anchor and window.x() for y anchor.
- if(_info->dimension(0) > 1)
+ if (_info->dimension(0) > 1)
{
anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
}
@@ -69,15 +72,19 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
// a size of the region.
// As the relation between input and output is transposed window.y() is
// used for x shape and window.x() for y shape.
- if(_info->dimension(0) > 1)
+ if (_info->dimension(0) > 1)
{
- shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right,
+ (window.y().end() - window.y().step()) * _scale_x + _width) -
+ anchor[0]);
}
- shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom,
+ (window.x().end() - window.x().step()) * _scale_y + _height) -
+ anchor[1]);
// For higher dimensions use the intersection of the window size and the
// valid region of the input
- for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ for (size_t d = 2; d < _info->num_dimensions(); ++d)
{
anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -89,7 +96,7 @@ ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, Va
bool AccessWindowTranspose::update_window_if_needed(Window &window) const
{
// Only update the window size if we can't use padding
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
@@ -107,12 +114,12 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
const int max_y = window.x().end() * _scale_y + _y;
// Adjust window start for output's Y dimension (so X in (input) window)
- if(min_y < 0)
+ if (min_y < 0)
{
// Calculate rows available above the tensor
const int front_pad_y_available = -offset_first_element / strides[1];
- if(min_y < front_pad_y_available)
+ if (min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y;
@@ -126,17 +133,18 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
}
// Adjust window end for Y dimension
- if(max_y > static_cast<int>(shape[1]))
+ if (max_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
// Calculate rows available below the tensor
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
- if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height;
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) +
+ window.x().step() * _scale_y - _y - _height;
window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step()));
window_modified = true;
}
@@ -151,11 +159,14 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
// Adjust window start for X dimension
- if(min_x < 0)
+ if (min_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+ stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(min_x < front_pad_x_available)
+ if (min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x;
@@ -168,14 +179,15 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
}
// Adjust window end for X dimension
- if(max_x > static_cast<int>(shape[0]))
+ if (max_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
- if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width;
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) +
+ window.y().step() * _scale_x - _x - _width;
window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step()));
window_modified = true;
}
@@ -189,7 +201,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
bool AccessWindowTranspose::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h
index 0306076d6e..12bb9a535b 100644
--- a/src/core/AccessWindowTranspose.h
+++ b/src/core/AccessWindowTranspose.h
@@ -42,7 +42,10 @@ public:
bool update_window_if_needed(Window &window) const override;
bool update_padding_if_needed(const Window &window) override;
using AccessWindowRectangle::compute_valid_region;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp
index 7fcfdf2c89..d094dcdaea 100644
--- a/src/core/CL/CLCommandBuffer.cpp
+++ b/src/core/CL/CLCommandBuffer.cpp
@@ -38,7 +38,7 @@ std::unique_ptr<CLCommandBuffer> CLCommandBuffer::create(cl_command_queue queue)
const auto &cl_device = CLKernelLibrary::get().get_device();
const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device);
- if(has_mutable_dispatch)
+ if (has_mutable_dispatch)
{
return std::make_unique<CLMutableCommandBuffer>(queue);
}
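
The factory above returns the mutable-dispatch command buffer when the device reports support and falls back to the compatibility path otherwise. A minimal sketch of that shape with placeholder types; these are not the library's classes.

#include <memory>

struct Buffer
{
    virtual ~Buffer()      = default;
    virtual void enqueue() = 0;
};

struct MutableBuffer final : Buffer
{
    void enqueue() override {}
};

struct CompatBuffer final : Buffer
{
    void enqueue() override {}
};

// Pick the richer implementation only when the capability is present.
static std::unique_ptr<Buffer> make_buffer(bool has_mutable_dispatch)
{
    if (has_mutable_dispatch)
    {
        return std::make_unique<MutableBuffer>();
    }
    return std::make_unique<CompatBuffer>();
}

int main()
{
    auto buf = make_buffer(false);
    buf->enqueue();
    return 0;
}
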
diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h
index 8a94e389fa..90e434161e 100644
--- a/src/core/CL/CLCommandBuffer.h
+++ b/src/core/CL/CLCommandBuffer.h
@@ -87,7 +87,8 @@ public:
* @param[in] global The global work size.
* @param[in] local The local work size.
*/
- virtual void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
+ virtual void
+ add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
/** Add the mutable argument to the current kernel enqueue command.
*
@@ -154,7 +155,7 @@ protected:
CLCommandBuffer &state(State state);
private:
- State _state{ State::Created };
+ State _state{State::Created};
};
} // namespace arm_compute
diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp
index f1a902c7b9..242fd7719c 100644
--- a/src/core/CL/CLCompatCommandBuffer.cpp
+++ b/src/core/CL/CLCompatCommandBuffer.cpp
@@ -31,8 +31,7 @@
namespace arm_compute
{
-CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue)
- : _queue(queue)
+CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue)
{
}
@@ -40,11 +39,14 @@ CLCompatCommandBuffer::~CLCompatCommandBuffer()
{
}
-void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local)
+void CLCompatCommandBuffer::add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local)
{
ARM_COMPUTE_ERROR_ON(state() != State::Created);
- _kernel_cmds.push_back(KernelCommand{ kernel, offset, global, local, {} });
+ _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}});
}
void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size)
@@ -52,7 +54,7 @@ void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const
ARM_COMPUTE_ERROR_ON(state() != State::Created);
ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty());
- _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{ arg_idx, size, value });
+ _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value});
}
void CLCompatCommandBuffer::finalize()
@@ -61,7 +63,7 @@ void CLCompatCommandBuffer::finalize()
_kernel_cmds.shrink_to_fit();
- for(auto &cmd : _kernel_cmds)
+ for (auto &cmd : _kernel_cmds)
{
cmd.mutable_args.shrink_to_fit();
}
@@ -80,25 +82,19 @@ void CLCompatCommandBuffer::enqueue()
{
ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
- for(const auto &cmd : _kernel_cmds)
+ for (const auto &cmd : _kernel_cmds)
{
- for(const auto &arg : cmd.mutable_args)
+ for (const auto &arg : cmd.mutable_args)
{
const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value);
handle_cl_error("clSetKernelArg", error);
}
- const auto error = clEnqueueNDRangeKernel(
- _queue,
- cmd.kernel,
- static_cast<cl_uint>(cmd.global.dimensions()),
- cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr,
- cmd.global.get(),
- cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr,
- 0,
- nullptr,
- nullptr);
+ const auto error =
+ clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast<cl_uint>(cmd.global.dimensions()),
+ cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(),
+ cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr);
handle_cl_error("clEnqueueNDRangeKernel", error);
}
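
A hedged sketch of the replay step above using the raw OpenCL C API: set any recorded mutable arguments, then enqueue with the recorded offset/global/local sizes. The queue and kernel are assumed to be valid objects created elsewhere, and the argument and range values are illustrative.

#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>

static cl_int enqueue_recorded(cl_command_queue queue, cl_kernel kernel)
{
    // One recorded mutable argument: kernel argument 0 set to a plain int.
    const int alpha = 2;
    cl_int    err   = clSetKernelArg(kernel, 0, sizeof(alpha), &alpha);
    if (err != CL_SUCCESS)
    {
        return err;
    }

    // nullptr offset means "start at the origin"; a nullptr local size would let the
    // driver choose the work-group size, mirroring the optional fields handled above.
    const size_t global[2] = {128, 128};
    const size_t local[2]  = {8, 8};
    return clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, global, local, 0, nullptr, nullptr);
}
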
diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h
index e91d52d2d6..d5df106425 100644
--- a/src/core/CL/CLCompatCommandBuffer.h
+++ b/src/core/CL/CLCompatCommandBuffer.h
@@ -57,7 +57,10 @@ public:
/** Disallow move assignment. */
CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete;
- void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override;
+ void add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local) override;
void finalize() override;
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 2d024f9c2f..9bbc32657e 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -22,19 +22,19 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
+
#include "support/StringSupport.h"
#include <regex>
namespace arm_compute
{
-CLBuildOptions::CLBuildOptions()
- : _build_opts()
+CLBuildOptions::CLBuildOptions() : _build_opts()
{
}
@@ -45,7 +45,7 @@ void CLBuildOptions::add_option(std::string option)
void CLBuildOptions::add_option_if(bool cond, std::string option)
{
- if(cond)
+ if (cond)
{
add_option(std::move(option));
}
@@ -63,7 +63,7 @@ void CLBuildOptions::add_options(const StringSet &options)
void CLBuildOptions::add_options_if(bool cond, const StringSet &options)
{
- if(cond)
+ if (cond)
{
add_options(options);
}
@@ -79,26 +79,35 @@ bool CLBuildOptions::operator==(const CLBuildOptions &other) const
return _build_opts == other._build_opts;
}
-Program::Program()
- : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
+Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
{
}
Program::Program(cl::Context context, std::string name, std::string source)
- : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary()
+ : _context(std::move(context)),
+ _device(),
+ _is_binary(false),
+ _name(std::move(name)),
+ _source(std::move(source)),
+ _binary()
{
}
Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary)
- : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary))
+ : _context(std::move(context)),
+ _device(std::move(device)),
+ _is_binary(true),
+ _name(std::move(name)),
+ _source(),
+ _binary(std::move(binary))
{
}
Program::operator cl::Program() const
{
- if(_is_binary)
+ if (_is_binary)
{
- return cl::Program(_context, { _device }, { _binary });
+ return cl::Program(_context, {_device}, {_binary});
}
else
{
@@ -112,12 +121,12 @@ bool Program::build(const cl::Program &program, const std::string &build_options
{
return program.build(build_options.c_str()) == CL_SUCCESS;
}
- catch(const cl::Error &e)
+ catch (const cl::Error &e)
{
cl_int err = CL_SUCCESS;
const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err);
- for(auto &pair : build_info)
+ for (auto &pair : build_info)
{
std::cerr << pair.second << std::endl;
}
@@ -133,14 +142,12 @@ cl::Program Program::build(const std::string &build_options) const
return cl_program;
}
-Kernel::Kernel()
- : _name(), _kernel()
+Kernel::Kernel() : _name(), _kernel()
{
}
Kernel::Kernel(std::string name, const cl::Program &program)
- : _name(std::move(name)),
- _kernel(cl::Kernel(program, _name.c_str()))
+ : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str()))
{
}
CLCompileContext::CLCompileContext()
@@ -156,15 +163,19 @@ CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device
_is_wbsm_supported = get_wbsm_support_info(device);
}
-Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
- const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const
+Kernel CLCompileContext::create_kernel(const std::string &kernel_name,
+ const std::string &program_name,
+ const std::string &program_source,
+ const std::string &kernel_path,
+ const StringSet &build_options_set,
+ bool is_binary) const
{
const std::string build_options = generate_build_options(build_options_set, kernel_path);
const std::string built_program_name = program_name + "_" + build_options;
auto built_program_it = _built_programs_map.find(built_program_name);
cl::Program cl_program;
- if(_built_programs_map.end() != built_program_it)
+ if (_built_programs_map.end() != built_program_it)
{
// If program has been built, retrieve to create kernel from it
cl_program = built_program_it->second;
@@ -184,11 +195,12 @@ Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std
return Kernel(kernel_name, cl_program);
}
-const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
+const Program &
+CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
{
const auto program_it = _programs_map.find(program_name);
- if(program_it != _programs_map.end())
+ if (program_it != _programs_map.end())
{
return program_it->second;
}
@@ -199,9 +211,10 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c
ARM_COMPUTE_UNUSED(is_binary);
program = Program(_context, program_name, program_source);
#else /* EMBEDDED_KERNELS */
- if(is_binary)
+ if (is_binary)
{
- program = Program(_context, _device.cl_device(), program_name, std::vector<unsigned char>(program_source.begin(), program_source.end()));
+ program = Program(_context, _device.cl_device(), program_name,
+ std::vector<unsigned char>(program_source.begin(), program_source.end()));
}
else
{
@@ -218,18 +231,19 @@ const Program &CLCompileContext::load_program(const std::string &program_name, c
void CLCompileContext::set_context(cl::Context context)
{
_context = std::move(context);
- if(_context.get() != nullptr)
+ if (_context.get() != nullptr)
{
const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
- if(!cl_devices.empty())
+ if (!cl_devices.empty())
{
_device = CLDevice(cl_devices[0]);
}
}
}
-std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
+std::string CLCompileContext::generate_build_options(const StringSet &build_options_set,
+ const std::string &kernel_path) const
{
std::string concat_str;
bool ext_supported = false;
@@ -241,27 +255,27 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
GPUTarget gpu_arch = get_arch_from_target(_device.target());
- concat_str += " -DGPU_ARCH=" + support::cpp11::to_string(
- static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
+ concat_str +=
+ " -DGPU_ARCH=" + support::cpp11::to_string(static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
- if(_device.supported("cl_khr_fp16"))
+ if (_device.supported("cl_khr_fp16"))
{
concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
}
- if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
+ if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
{
concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 ";
}
- if(_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
+ if (_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
{
concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 ";
}
std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported();
- if(ext_supported)
+ if (ext_supported)
{
concat_str += ext_buildopts;
}
@@ -270,7 +284,7 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti
ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
}
- if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
+ if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
{
concat_str += " -DUNROLL_WITH_PRAGMA ";
}
@@ -295,7 +309,7 @@ std::string CLCompileContext::stringify_set(const StringSet &s, const std::strin
#endif /* EMBEDDED_KERNELS */
// Concatenate set
- for(const auto &el : s)
+ for (const auto &el : s)
{
concat_set += " " + el;
}
@@ -340,7 +354,7 @@ cl::NDRange CLCompileContext::default_ndrange() const
GPUTarget _target = get_target_from_device(_device.cl_device());
cl::NDRange default_range;
- switch(_target)
+ switch (_target)
{
case GPUTarget::MIDGARD:
case GPUTarget::T600:
@@ -370,7 +384,8 @@ size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) cons
size_t result;
size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
ARM_COMPUTE_UNUSED(err);
return result;
@@ -392,7 +407,7 @@ int32_t CLCompileContext::get_ddk_version() const
const std::regex ddk_regex("r([0-9]*)p[0-9]");
std::smatch ddk_match;
- if(std::regex_search(device_version, ddk_match, ddk_regex))
+ if (std::regex_search(device_version, ddk_match, ddk_regex))
{
return std::stoi(ddk_match[1]);
}
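
A standalone sketch of the DDK-version parse above: pull the major revision out of a driver version string such as "OpenCL 3.0 v1.r38p1-01eac0" (the sample format is illustrative). Returning -1 when nothing matches is a local choice for this sketch.

#include <regex>
#include <string>

static int ddk_major_version(const std::string &device_version)
{
    const std::regex ddk_regex("r([0-9]*)p[0-9]");
    std::smatch      ddk_match;
    if (std::regex_search(device_version, ddk_match, ddk_regex))
    {
        return std::stoi(ddk_match[1]);
    }
    return -1;
}

int main()
{
    return ddk_major_version("OpenCL 3.0 v1.r38p1-01eac0") == 38 ? 0 : 1;
}
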
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 78f36100d5..5ea99d360a 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLHelpers.h"
+
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Types.h"
-#include "src/gpu/cl/ClCompileContext.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/ClKernelLibrary.h"
#include <utility>
@@ -39,7 +40,7 @@ namespace arm_compute
{
std::string get_cl_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -75,7 +76,7 @@ std::string get_cl_type_from_data_type(const DataType &dt)
std::string get_cl_promoted_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -105,7 +106,7 @@ std::string get_cl_promoted_type_from_data_type(const DataType &dt)
std::string get_cl_unsigned_type_from_element_size(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return "uchar";
@@ -123,7 +124,7 @@ std::string get_cl_unsigned_type_from_element_size(size_t element_size)
std::string get_cl_signed_type_from_element_size(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return "char";
@@ -141,7 +142,7 @@ std::string get_cl_signed_type_from_element_size(size_t element_size)
std::string get_cl_select_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -174,7 +175,7 @@ std::string get_cl_select_type_from_data_type(const DataType &dt)
std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -192,7 +193,7 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
std::string get_data_size_from_data_type(const DataType &dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::S8:
@@ -244,8 +245,9 @@ bool dot8_supported(const cl::Device &device)
const GPUTarget gpu_target = get_target_from_name(device_name);
    // SW_WORKAROUND: Workaround for DDK revision r14p0 to enable cl_arm_integer_dot_product_int8
- std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 };
- return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
+ std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+ return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") ||
+ sw_workaround_issue.count(gpu_target) != 0);
}
bool dot8_acc_supported(const cl::Device &device)
@@ -256,23 +258,23 @@ bool dot8_acc_supported(const cl::Device &device)
CLVersion get_cl_version(const cl::Device &device)
{
std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
- if(version_str.find("OpenCL 3") != std::string::npos)
+ if (version_str.find("OpenCL 3") != std::string::npos)
{
return CLVersion::CL30;
}
- else if(version_str.find("OpenCL 2") != std::string::npos)
+ else if (version_str.find("OpenCL 2") != std::string::npos)
{
return CLVersion::CL20;
}
- else if(version_str.find("OpenCL 1.2") != std::string::npos)
+ else if (version_str.find("OpenCL 1.2") != std::string::npos)
{
return CLVersion::CL12;
}
- else if(version_str.find("OpenCL 1.1") != std::string::npos)
+ else if (version_str.find("OpenCL 1.1") != std::string::npos)
{
return CLVersion::CL11;
}
- else if(version_str.find("OpenCL 1.0") != std::string::npos)
+ else if (version_str.find("OpenCL 1.0") != std::string::npos)
{
return CLVersion::CL10;
}
@@ -287,14 +289,15 @@ bool device_supports_extension(const cl::Device &device, const char *extension_n
return (pos != std::string::npos);
}
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout)
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+ const Size2D &kernel_size,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
- std::vector<WinogradConfiguration> winograd_configs_nchw =
- {
+ std::vector<WinogradConfiguration> winograd_configs_nchw = {
WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)),
@@ -303,11 +306,9 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
- WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
- };
+ WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))};
- std::vector<WinogradConfiguration> winograd_configs_nhwc =
- {
+ std::vector<WinogradConfiguration> winograd_configs_nhwc = {
WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
@@ -324,19 +325,21 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si
std::pair<int, int>(kernel_size.width, kernel_size.height));
// Return true if supported
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end());
+ return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) !=
+ winograd_configs_nchw.end());
}
else
{
- return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
+ return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) !=
+ winograd_configs_nhwc.end());
}
}
size_t preferred_vector_width(const cl::Device &device, const DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::S8:
@@ -382,7 +385,7 @@ size_t get_cl_image_pitch_alignment(const cl::Device &device)
cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr);
- if(err == CL_SUCCESS)
+ if (err == CL_SUCCESS)
{
return pixel_aligment;
}
@@ -396,12 +399,14 @@ bool get_cl_non_uniform_work_group_supported(const cl::Device &device)
{
cl_bool supported = CL_FALSE;
- cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
+ cl_int err =
+ clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
return (err == CL_SUCCESS && supported == CL_TRUE);
}
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
+cl::Kernel
+create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
{
opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
@@ -409,7 +414,8 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_
auto kernel_src = klib.program(program_name);
const std::string kernel_path = klib.kernel_path();
- return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary));
+ return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path,
+ build_opts, kernel_src.is_binary));
}
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size)
@@ -423,8 +429,9 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio
bool get_wbsm_support_info(const cl::Device &device)
{
cl_bitfield capabilities = 0;
- cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr);
- if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield),
+ &capabilities, nullptr);
+ if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
{
return true;
}
@@ -433,35 +440,33 @@ bool get_wbsm_support_info(const cl::Device &device)
void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
{
- cl_int err = clSetKernelExecInfo(kernel.get(),
- CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
- sizeof(cl_int),
- &wbsm_hint);
+ cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
+ sizeof(cl_int), &wbsm_hint);
ARM_COMPUTE_UNUSED(err);
ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS);
}
bool export_to_cl_image(const ITensorInfo *tensor)
{
- if(tensor->tensor_shape()[0] % 4 != 0)
+ if (tensor->tensor_shape()[0] % 4 != 0)
{
return false;
}
// If not floating point
- if(!is_data_type_float(tensor->data_type()))
+ if (!is_data_type_float(tensor->data_type()))
{
return false;
}
// Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
- if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+ if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
{
return false;
}
// Check cl image pitch alignment
- if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
+ if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
{
return false;
}
@@ -471,7 +476,7 @@ bool export_to_cl_image(const ITensorInfo *tensor)
const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
- if(image_w > max_image_w || image_h > max_image_h)
+ if (image_w > max_image_w || image_h > max_image_h)
{
return false;
}
@@ -481,9 +486,9 @@ bool export_to_cl_image(const ITensorInfo *tensor)
void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values)
{
- for(const int value : values)
+ for (const int value : values)
{
- if(value > max_manual_loop_unrolling)
+ if (value > max_manual_loop_unrolling)
{
built_opts.add_option("-DUNROLL_WITH_PRAGMA");
return;
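
For reference, a minimal standalone sketch (not part of this patch) of the clGetDeviceInfo query pattern used by the reformatted helpers above for pitch alignment and WBSM support; the cl_device_id is assumed to come from an already-initialised platform.

    // Sketch only: query CL_DEVICE_IMAGE_PITCH_ALIGNMENT the same way
    // get_cl_image_pitch_alignment() does, returning 0 when the query fails.
    #include <CL/cl.h>

    static cl_uint query_pitch_alignment(cl_device_id device)
    {
        cl_uint alignment = 0;
        const cl_int err =
            clGetDeviceInfo(device, CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &alignment, nullptr);
        return (err == CL_SUCCESS) ? alignment : 0;
    }
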
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index c5a0796c3a..e69d006750 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "arm_compute/core/Error.h"
+
#include "src/gpu/cl/ClKernelLibrary.h"
+
#include <algorithm>
#include <array>
#include <fstream>
@@ -31,8 +34,7 @@
#include <vector>
namespace arm_compute
{
-CLKernelLibrary::CLKernelLibrary()
- : _compile_context()
+CLKernelLibrary::CLKernelLibrary() : _compile_context()
{
opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built
}
@@ -41,13 +43,15 @@ CLKernelLibrary &CLKernelLibrary::get()
static CLKernelLibrary _kernel_library;
return _kernel_library;
}
-Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set<std::string> &build_options_set) const
+Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name,
+ const std::set<std::string> &build_options_set) const
{
const opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
const std::string program_name = klib.program_name(kernel_name);
auto program = klib.program(program_name);
const std::string &kernel_path = CLKernelLibrary::get().get_kernel_path();
- return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary);
+ return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set,
+ program.is_binary);
}
std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const
{
@@ -131,4 +135,4 @@ CLCompileContext &CLKernelLibrary::get_compile_context()
{
return _compile_context;
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
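
For reference, a hedged usage sketch of the two-argument CLKernelLibrary::create_kernel() overload reformatted above; the kernel name and build option are placeholders, not identifiers guaranteed to exist in the library.

    #include "arm_compute/core/CL/CLKernelLibrary.h"

    #include <set>
    #include <string>

    // Sketch only: fetch a kernel from the singleton library with one build option.
    arm_compute::Kernel make_example_kernel()
    {
        const std::set<std::string> build_opts{"-DDATA_TYPE=float"};
        return arm_compute::CLKernelLibrary::get().create_kernel("example_kernel", build_opts);
    }
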
diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp
index b9c59ac6f0..05b351fc25 100644
--- a/src/core/CL/CLMutableCommandBuffer.cpp
+++ b/src/core/CL/CLMutableCommandBuffer.cpp
@@ -31,8 +31,7 @@
namespace arm_compute
{
-CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue)
- : CLCommandBuffer()
+CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer()
{
cl_int status = CL_SUCCESS;
@@ -52,7 +51,10 @@ CLMutableCommandBuffer::~CLMutableCommandBuffer()
handle_cl_error("clReleaseCommandBufferKHR", status);
}
-void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local)
+void CLMutableCommandBuffer::add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local)
{
ARM_COMPUTE_ERROR_ON(state() != State::Created);
@@ -65,18 +67,8 @@ void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &off
};
const auto error = clCommandNDRangeKernelKHR(
- _cb,
- nullptr,
- properties,
- kernel,
- global.dimensions(),
- offset.dimensions() != 0 ? offset.get() : nullptr,
- global.get(),
- local.dimensions() != 0 ? local.get() : nullptr,
- 0,
- nullptr,
- nullptr,
- &mutable_handle);
+ _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr,
+ global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle);
handle_cl_error("clCommandNDRangeKernelKHR", error);
@@ -114,7 +106,7 @@ void CLMutableCommandBuffer::finalize()
size_t arg_no = 0;
- for(auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
+ for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
{
ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size());
mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no];
@@ -132,9 +124,7 @@ void CLMutableCommandBuffer::update()
{
ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
- const auto error = clUpdateMutableCommandsKHR(
- _cb,
- &_mut_cfg);
+ const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg);
handle_cl_error("clUpdateMutableCommandsKHR", error);
}
@@ -143,13 +133,7 @@ void CLMutableCommandBuffer::enqueue()
{
ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
- const auto error = clEnqueueCommandBufferKHR(
- 0,
- nullptr,
- _cb,
- 0,
- nullptr,
- nullptr);
+ const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr);
handle_cl_error("clEnqueueCommandBufferKHR", error);
}
diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h
index 04e94b0bb2..8997d7d1fd 100644
--- a/src/core/CL/CLMutableCommandBuffer.h
+++ b/src/core/CL/CLMutableCommandBuffer.h
@@ -57,7 +57,10 @@ public:
/** Disallow move assignment. */
CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete;
- void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override;
+ void add_kernel(cl_kernel kernel,
+ const cl::NDRange &offset,
+ const cl::NDRange &global,
+ const cl::NDRange &local) override;
void finalize() override;
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 289300b3a1..290ed32648 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "support/StringSupport.h"
namespace arm_compute
@@ -38,15 +39,15 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
const cl::Context &ctx = CLKernelLibrary::get().context();
- const cl::Buffer &buffer = tensor->cl_buffer();
+ const cl::Buffer &buffer = tensor->cl_buffer();
const ITensorInfo *info = tensor->info();
- ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(),
- "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
+ ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to "
+ "satisfy cl_image pitch alignment requirement");
- const size_t image_w{ info->dimension(0) / 4 };
- const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) };
- const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>() };
- const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() };
+ const size_t image_w{info->dimension(0) / 4};
+ const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)};
+ const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>()};
+ const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()};
ARM_COMPUTE_UNUSED(max_image_w, max_image_h);
ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image");
@@ -58,18 +59,22 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type);
}
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type)
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+ const cl::Buffer &buffer,
+ const TensorShape &shape2d,
+ DataType data_type,
+ size_t image_row_pitch,
+ CLImage2DType image_type)
{
ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
"The extension cl_khr_image2d_from_buffer is not supported on the target platform");
ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
"Impossible to retrieve the cl_image pitch alignment");
- ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr,
- "Cannot create cl_image from empty cl_buffer");
+ ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer");
cl_channel_type cl_data_type;
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
cl_data_type = CL_FLOAT;
@@ -84,7 +89,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
cl_mem cl_image;
cl_int err = CL_SUCCESS;
- const cl_image_format format = { CL_RGBA, cl_data_type };
+ const cl_image_format format = {CL_RGBA, cl_data_type};
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));
@@ -94,7 +99,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
desc.image_width = shape2d[0];
desc.image_height = shape2d[1];
- switch(image_type)
+ switch (image_type)
{
case CLImage2DType::ReadOnly:
cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
@@ -114,7 +119,7 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
void handle_cl_error(const std::string &function_name, cl_int error_code)
{
- if(error_code != CL_SUCCESS)
+ if (error_code != CL_SUCCESS)
{
std::string error_message = function_name + " - Error code: " + std::to_string(error_code);
ARM_COMPUTE_ERROR(error_message.c_str());
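
For reference, a minimal raw-OpenCL sketch (not part of this patch) of the clCreateImage-from-buffer pattern that create_image2d_from_buffer() above wraps; the CL_RGBA/CL_FLOAT format and the read-only flag are assumptions made for the example.

    #include <CL/cl.h>

    #include <cstring>

    // Sketch only: describe a 2D image that aliases an existing cl_mem buffer
    // (requires the cl_khr_image2d_from_buffer extension) and create it.
    static cl_mem make_image2d_from_buffer(
        cl_context ctx, cl_mem buffer, size_t width, size_t height, size_t row_pitch_bytes)
    {
        const cl_image_format format = {CL_RGBA, CL_FLOAT};

        cl_image_desc desc;
        std::memset(&desc, 0, sizeof(desc));
        desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
        desc.image_width     = width;
        desc.image_height    = height;
        desc.image_row_pitch = row_pitch_bytes;
        desc.buffer          = buffer;

        cl_int err   = CL_SUCCESS;
        cl_mem image = clCreateImage(ctx, CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
        return (err == CL_SUCCESS) ? image : nullptr;
    }
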
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index de9c1b3194..f9dcfeac3a 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -72,7 +72,12 @@ cl::Image2D create_image2d_from_tensor(const ICLTensor *tensor, CLImage2DType im
*
* @return cl::Image2D object
*/
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+ const cl::Buffer &buffer,
+ const TensorShape &shape2d,
+ DataType data_type,
+ size_t image_row_pitch,
+ CLImage2DType image_type);
/** Check for CL error code and throw exception accordingly.
*
diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h
index 7b5294e452..50d224f1c0 100644
--- a/src/core/CL/CLValidate.h
+++ b/src/core/CL/CLValidate.h
@@ -29,11 +29,13 @@
namespace arm_compute
{
-#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
+ ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+ CLKernelLibrary::get().fp16_supported()))
-#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+ CLKernelLibrary::get().fp16_supported()))
/** Return an error if int64_base_atomics extension is not supported by the device.
*
@@ -43,11 +45,13 @@ namespace arm_compute
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
+inline arm_compute::Status
+error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
{
- if(!CLKernelLibrary::get().int64_base_atomics_supported())
+ if (!CLKernelLibrary::get().int64_base_atomics_supported())
{
- return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
+ return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line,
+ "Atomic functions are not supported");
}
return arm_compute::Status{};
}
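
For reference, a hedged sketch of how the RETURN-style macros above are typically chained inside a kernel's validate() helper; the function name is illustrative and the include paths follow the files touched in this patch.

    #include "arm_compute/core/Validate.h"

    #include "src/core/CL/CLValidate.h"

    namespace arm_compute
    {
    // Sketch only: return a Status early instead of throwing.
    inline Status validate_fp16_input(const ITensorInfo *src)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
        return Status{};
    }
    } // namespace arm_compute
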
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
index a53fdbbab6..f96b24d2a9 100644
--- a/src/core/CL/DefaultLWSHeuristics.cpp
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -31,13 +31,13 @@ cl::NDRange get_gemm_lws(size_t gws_x, size_t gws_y, size_t gws_z)
{
ARM_COMPUTE_UNUSED(gws_y);
- if(gws_z != 1)
+ if (gws_z != 1)
{
return cl::NDRange(4, 4, 2);
}
else
{
- if(gws_x > 256)
+ if (gws_x > 256)
{
return cl::NDRange(2, 16, 1);
}
@@ -59,9 +59,9 @@ cl::NDRange get_direct_lws(size_t gws_x, size_t gws_y, size_t gws_z)
{
ARM_COMPUTE_UNUSED(gws_z);
- if(gws_x < gws_y)
+ if (gws_x < gws_y)
{
- if(gws_x < 4)
+ if (gws_x < 4)
{
return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
}
@@ -81,7 +81,7 @@ cl::NDRange get_dwc_lws(size_t gws_x, size_t gws_y, size_t gws_z)
ARM_COMPUTE_UNUSED(gws_y);
ARM_COMPUTE_UNUSED(gws_z);
- if(gws_x < 32)
+ if (gws_x < 32)
{
return cl::NDRange(gws_x, 4, 4);
}
@@ -100,7 +100,7 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
const size_t gws_y = gws[1];
const size_t gws_z = gws[2];
- switch(kernel_type)
+ switch (kernel_type)
{
case CLKernelType::GEMM:
{
@@ -124,4 +124,4 @@ cl::NDRange get_default_lws_for_type(CLKernelType kernel_type, cl::NDRange gws)
}
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
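
For reference, a small usage sketch of the LWS heuristic reformatted above; the global work size is an assumed example chosen so that the gws_z == 1, gws_x > 256 branch of get_gemm_lws() is taken.

    #include "src/core/CL/DefaultLWSHeuristics.h"

    // Sketch only: for gws = (512, 32, 1) the GEMM heuristic returns (2, 16, 1).
    cl::NDRange pick_gemm_lws()
    {
        const cl::NDRange gws(512, 32, 1);
        return arm_compute::get_default_lws_for_type(arm_compute::CLKernelType::GEMM, gws);
    }
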
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index dc3a86a528..ac53e7f1d2 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -25,18 +25,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/Utils.h"
#include <cstddef>
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
+void arm_compute::enqueue(cl::CommandQueue &queue,
+ ICLKernel &kernel,
+ const Window &window,
+ const cl::NDRange &lws_hint,
+ bool use_dummy_work_items)
{
- if(kernel.kernel()() == nullptr)
+ if (kernel.kernel()() == nullptr)
{
return;
}
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(window[i].step() == 0);
// Make sure that dimensions > Z are 1
@@ -46,7 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
// Check for empty NDRange
- if(gws.dimensions() == 0)
+ if (gws.dimensions() == 0)
{
return;
}
@@ -54,7 +59,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
kernel.cache_gws(gws);
cl::NDRange valid_lws;
- if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+ if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
{
valid_lws = cl::NullRange;
}
@@ -65,12 +70,12 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
cl::NDRange lws = cl::NullRange;
- if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
+ if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
{
lws = valid_lws;
}
- if(CLKernelLibrary::get().is_wbsm_supported())
+ if (CLKernelLibrary::get().is_wbsm_supported())
{
set_wbsm(kernel.kernel(), kernel.wbsm_hint());
}
@@ -90,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
// Calculate offset to the start of the window
unsigned int offset_first_element = info->offset_first_element_in_bytes();
- for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ for (unsigned int n = 0; n < info->num_dimensions(); ++n)
{
offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
}
@@ -98,7 +103,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
unsigned int idx_start = idx;
_kernel.setArg(idx++, tensor->cl_buffer());
- for(unsigned int d = 0; d < dimension_size; ++d)
+ for (unsigned int d = 0; d < dimension_size; ++d)
{
_kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]);
_kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step()));
@@ -107,7 +112,8 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
_kernel.setArg<cl_uint>(idx++, offset_first_element);
ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
- "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+ "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel",
+ dimension_size, num_arguments_per_tensor<dimension_size>());
ARM_COMPUTE_UNUSED(idx_start);
}
@@ -178,7 +184,7 @@ void ICLKernel::set_target(cl::Device &device)
size_t ICLKernel::get_max_workgroup_size()
{
- if(_max_workgroup_size == 0)
+ if (_max_workgroup_size == 0)
{
_max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
}
@@ -187,7 +193,7 @@ size_t ICLKernel::get_max_workgroup_size()
cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
{
- if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+ if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
{
return cl::NullRange;
}
@@ -196,7 +202,7 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
- if(use_dummy_work_items)
+ if (use_dummy_work_items)
{
gws.get()[0] = get_next_power_two(gws[0]);
gws.get()[1] = get_next_power_two(gws[1]);
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index c82809cef3..6aebef15a5 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -27,10 +27,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/IKernel.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLTuningParams.h"
#include "src/core/CL/DefaultLWSHeuristics.h"
@@ -43,14 +43,14 @@ namespace
{
bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
{
- if(lws0.dimensions() != lws1.dimensions())
+ if (lws0.dimensions() != lws1.dimensions())
{
return false;
}
- for(size_t i = 0; i < lws0.dimensions(); ++i)
+ for (size_t i = 0; i < lws0.dimensions(); ++i)
{
- if(lws0.get()[i] != lws1.get()[i])
+ if (lws0.get()[i] != lws1.get()[i])
{
return false;
}
@@ -71,7 +71,7 @@ private:
*
* @return The number of arguments enqueued per array object.
*/
- template <unsigned int dimension_size>
+ template <unsigned int dimension_size>
constexpr static unsigned int num_arguments_per_array()
{
return num_arguments_per_tensor<dimension_size>();
@@ -80,7 +80,7 @@ private:
*
* @return The number of arguments enqueued per tensor object.
*/
- template <unsigned int dimension_size>
+ template <unsigned int dimension_size>
constexpr static unsigned int num_arguments_per_tensor()
{
return 2 + 2 * dimension_size;
@@ -116,11 +116,13 @@ protected:
* @param[in] window The maximum window which will be returned by window()
* @param[in] tuning_params_hint (Optional) Tuning parameters to use.
*/
- void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+ void configure_internal(const Window &window,
+ CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(),
+ 0))
{
_tuning_params_hint = tuning_params_hint;
- if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
+ if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
{
// Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which
// will be recalculated with use_dummy_work_items flag at run time again anyway.
@@ -133,7 +135,13 @@ protected:
public:
/** Constructor */
ICLKernel()
- : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange)
+ : _kernel(nullptr),
+ _target(GPUTarget::MIDGARD),
+ _config_id(arm_compute::default_config_id),
+ _max_workgroup_size(0),
+ _type(CLKernelType::UNKNOWN),
+ _tuning_params_hint(),
+ _cached_gws(cl::NullRange)
{
}
/** Returns a reference to the OpenCL kernel of this object.
@@ -161,7 +169,11 @@ public:
* @param[in] window Window the kernel will be executed on.
*/
template <typename T>
- void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+ void add_1D_array_argument(unsigned int &idx,
+ const ICLArray<T> *array,
+ const Strides &strides,
+ unsigned int num_dimensions,
+ const Window &window)
{
add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
}
@@ -184,7 +196,7 @@ public:
*/
void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
{
- if(cond)
+ if (cond)
{
add_1D_tensor_argument(idx, tensor, window);
}
@@ -208,7 +220,7 @@ public:
*/
void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
{
- if(cond)
+ if (cond)
{
add_2D_tensor_argument(idx, tensor, window);
}
@@ -469,7 +481,11 @@ private:
* @param[in] window Window the kernel will be executed on.
*/
template <typename T, unsigned int dimension_size>
- void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+ void add_array_argument(unsigned int &idx,
+ const ICLArray<T> *array,
+ const Strides &strides,
+ unsigned int num_dimensions,
+ const Window &window);
/** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
*
* @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -505,7 +521,11 @@ private:
*
* @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
*/
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
+void enqueue(cl::CommandQueue &queue,
+ ICLKernel &kernel,
+ const Window &window,
+ const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(),
+ bool use_dummy_work_items = false);
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
*
@@ -516,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c
* @param[in] window Window the kernel will be executed on.
*/
template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+void ICLKernel::add_array_argument(
+ unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
{
ARM_COMPUTE_ERROR_ON(array == nullptr);
// Calculate offset to the start of the window
unsigned int offset_first_element = 0;
- for(unsigned int n = 0; n < num_dimensions; ++n)
+ for (unsigned int n = 0; n < num_dimensions; ++n)
{
offset_first_element += window[n].start() * strides[n];
}
@@ -531,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
unsigned int idx_start = idx;
_kernel.setArg(idx++, array->cl_buffer());
- for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+ for (unsigned int dimension = 0; dimension < dimension_size; dimension++)
{
_kernel.setArg<cl_uint>(idx++, strides[dimension]);
_kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
@@ -540,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
_kernel.setArg<cl_uint>(idx++, offset_first_element);
ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
- "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+ "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel",
+ dimension_size, num_arguments_per_array<dimension_size>());
ARM_COMPUTE_UNUSED(idx_start);
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLKERNEL_H */
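
For reference, the argument-count arithmetic behind the assertion in add_array_argument()/add_tensor_argument() above: one buffer plus one byte offset plus a (stride, step-in-bytes) pair per dimension.

    // Sketch only: num_arguments_per_tensor<D>() == 2 + 2 * D, so a 3D tensor
    // contributes exactly 8 OpenCL kernel arguments.
    static_assert(2 + 2 * 3 == 8, "3D tensor: buffer + offset + 3 x (stride, step)");
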
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index 5d8295bdfe..3f7edbb88d 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -40,6 +40,5 @@ void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, slice);
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h
index 5246492401..97bc1e58c2 100644
--- a/src/core/CL/ICLSimple2DKernel.h
+++ b/src/core/CL/ICLSimple2DKernel.h
@@ -37,5 +37,5 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index fef1a86125..71d7d1f07b 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -42,6 +42,5 @@ void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h
index ff0b274663..5071b6b339 100644
--- a/src/core/CL/ICLSimple3DKernel.h
+++ b/src/core/CL/ICLSimple3DKernel.h
@@ -39,5 +39,5 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index d67fefdf71..c31db8355f 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -22,30 +22,35 @@
* SOFTWARE.
*/
#include "src/core/CL/ICLSimpleKernel.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/WindowHelpers.h"
using namespace arm_compute;
-ICLSimpleKernel::ICLSimpleKernel()
- : _input(nullptr), _output(nullptr)
+ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr)
{
}
-void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+void ICLSimpleKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_elems_processed_per_iteration,
+ bool border_undefined,
+ const BorderSize &border_size)
{
_input = input;
_output = output;
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ Window win =
+ calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
output_access);
output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h
index b35547a217..6afd7309aa 100644
--- a/src/core/CL/ICLSimpleKernel.h
+++ b/src/core/CL/ICLSimpleKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -55,12 +56,16 @@ public:
* @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
* @param[in] border_size (Optional) Size of the border.
*/
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_elems_processed_per_iteration,
+ bool border_undefined = false,
+ const BorderSize &border_size = BorderSize());
protected:
const ICLTensor *_input;
ICLTensor *_output;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
index b541bff04a..0771db7f50 100644
--- a/src/core/CL/ICLTensor.cpp
+++ b/src/core/CL/ICLTensor.cpp
@@ -27,8 +27,7 @@
using namespace arm_compute;
-ICLTensor::ICLTensor()
- : _mapping(nullptr)
+ICLTensor::ICLTensor() : _mapping(nullptr)
{
}
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index b092dfb4e2..35421d025e 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -36,11 +36,7 @@
namespace arm_compute
{
-CLSymbols::CLSymbols() noexcept(false)
- : _loaded(
-{
- false, false
-})
+CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false})
{
}
@@ -52,9 +48,9 @@ CLSymbols &CLSymbols::get()
bool CLSymbols::load_default()
{
- static const std::vector<std::string> libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };
+ static const std::vector<std::string> libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
- if(_loaded.first)
+ if (_loaded.first)
{
return _loaded.second;
}
@@ -62,34 +58,32 @@ bool CLSymbols::load_default()
// Indicate that default loading has been tried
_loaded.first = true;
- if(load(libraries_filenames, /* use_loader */ false))
+ if (load(libraries_filenames, /* use_loader */ false))
{
- ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library");
+ ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+ "Failed to load OpenCL symbols from shared library");
return true;
}
#ifdef __ANDROID__
// When running in NDK environment, the above libraries are not accessible.
- static const std::vector<std::string> android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" };
+ static const std::vector<std::string> android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"};
- if(load(android_libraries_filenames, /* use_loader */ true))
+ if (load(android_libraries_filenames, /* use_loader */ true))
{
- ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library");
+ ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+ "Failed to load OpenCL symbols from android shared library");
return true;
}
#endif // __ANDROID__
// If not returned till here then libraries not found
std::stringstream ss;
- std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s)
- {
- ss << s << " ";
- });
+ std::for_each(libraries_filenames.begin(), libraries_filenames.end(),
+ [&ss](const std::string &s) { ss << s << " "; });
#ifdef __ANDROID__
- std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s)
- {
- ss << s << " ";
- });
+ std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(),
+ [&ss](const std::string &s) { ss << s << " "; });
#endif // __ANDROID__
std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl;
return false;
@@ -99,15 +93,15 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u
{
void *handle = nullptr;
unsigned int index = 0;
- for(index = 0; index < libraries_filenames.size(); ++index)
+ for (index = 0; index < libraries_filenames.size(); ++index)
{
handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL);
- if(handle != nullptr)
+ if (handle != nullptr)
{
break;
}
}
- if(index == libraries_filenames.size())
+ if (index == libraries_filenames.size())
{
// Set status of loading to failed
_loaded.second = false;
@@ -115,22 +109,23 @@ bool CLSymbols::load(const std::vector<std::string> &libraries_filenames, bool u
}
#ifdef __ANDROID__
- typedef void* (*loadOpenCLPointer_t)(const char* name);
+ typedef void *(*loadOpenCLPointer_t)(const char *name);
loadOpenCLPointer_t loadOpenCLPointer;
- if (use_loader) {
+ if (use_loader)
+ {
typedef void (*enableOpenCL_t)();
- enableOpenCL_t enableOpenCL =
- reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
+ enableOpenCL_t enableOpenCL = reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
enableOpenCL();
- loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(
- dlsym(handle, "loadOpenCLPointer"));
- } else {
+ loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer"));
+ }
+ else
+ {
loadOpenCLPointer = nullptr;
}
-#define LOAD_FUNCTION_PTR(func_name, _handle) \
- func_name##_ptr = reinterpret_cast<decltype(func_name) *>( use_loader ? \
- loadOpenCLPointer(#func_name) : dlsym(handle, #func_name));
+#define LOAD_FUNCTION_PTR(func_name, _handle) \
+ func_name##_ptr = reinterpret_cast<decltype(func_name) *>(use_loader ? loadOpenCLPointer(#func_name) \
+ : dlsym(handle, #func_name));
#else /* __ANDROID__ */
(void)use_loader; // Avoid unused warning
#define LOAD_FUNCTION_PTR(func_name, handle) \
@@ -234,12 +229,11 @@ bool opencl_is_available()
}
} // namespace arm_compute
-cl_int clEnqueueMarker(cl_command_queue command_queue,
- cl_event *event)
+cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, event);
}
@@ -249,12 +243,11 @@ cl_int clEnqueueMarker(cl_command_queue command_queue,
}
}
-cl_int clWaitForEvents(cl_uint num_events,
- const cl_event *event_list)
+cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_events, event_list);
}
@@ -264,12 +257,18 @@ cl_int clWaitForEvents(cl_uint num_events,
}
}
-cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr,
- size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMMap(cl_command_queue command_queue,
+ cl_bool blocking_map,
+ cl_map_flags flags,
+ void *svm_ptr,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event);
}
@@ -279,12 +278,15 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_
}
}
-cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMUnmap(cl_command_queue command_queue,
+ void *svm_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
}
@@ -298,7 +300,7 @@ void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, size, alignment);
}
@@ -312,7 +314,7 @@ void clSVMFree(cl_context context, void *svm_pointer)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSVMFree_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
func(context, svm_pointer);
}
@@ -326,7 +328,7 @@ cl_int clGetContextInfo(cl_context context,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -343,7 +345,7 @@ cl_command_queue clCreateCommandQueue(cl_context context,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, device, properties, errcode_ret);
}
@@ -360,7 +362,7 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, device, properties, errcode_ret);
}
@@ -370,17 +372,16 @@ cl_command_queue clCreateCommandQueueWithProperties(cl_context c
}
}
-cl_context clCreateContext(
- const cl_context_properties *properties,
- cl_uint num_devices,
- const cl_device_id *devices,
- void (*pfn_notify)(const char *, const void *, size_t, void *),
- void *user_data,
- cl_int *errcode_ret)
+cl_context clCreateContext(const cl_context_properties *properties,
+ cl_uint num_devices,
+ const cl_device_id *devices,
+ void (*pfn_notify)(const char *, const void *, size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret);
}
@@ -398,7 +399,7 @@ cl_context clCreateContextFromType(const cl_context_properties *properties,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(properties, device_type, pfn_notify, user_data, errcode_ret);
}
@@ -408,17 +409,16 @@ cl_context clCreateContextFromType(const cl_context_properties *properties,
}
}
-cl_int clBuildProgram(
- cl_program program,
- cl_uint num_devices,
- const cl_device_id *device_list,
- const char *options,
- void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
- void *user_data)
+cl_int clBuildProgram(cl_program program,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const char *options,
+ void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+ void *user_data)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, num_devices, device_list, options, pfn_notify, user_data);
}
@@ -428,22 +428,22 @@ cl_int clBuildProgram(
}
}
-cl_int clEnqueueNDRangeKernel(
- cl_command_queue command_queue,
- cl_kernel kernel,
- cl_uint work_dim,
- const size_t *global_work_offset,
- const size_t *global_work_size,
- const size_t *local_work_size,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size,
+ num_events_in_wait_list, event_wait_list, event);
}
else
{
@@ -451,15 +451,11 @@ cl_int clEnqueueNDRangeKernel(
}
}
-cl_int clSetKernelArg(
- cl_kernel kernel,
- cl_uint arg_index,
- size_t arg_size,
- const void *arg_value)
+cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, arg_index, arg_size, arg_value);
}
@@ -473,7 +469,7 @@ cl_int clRetainMemObject(cl_mem memobj)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj);
}
@@ -487,7 +483,7 @@ cl_int clReleaseMemObject(cl_mem memobj)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj);
}
@@ -497,17 +493,16 @@ cl_int clReleaseMemObject(cl_mem memobj)
}
}
-cl_int clEnqueueUnmapMemObject(
- cl_command_queue command_queue,
- cl_mem memobj,
- void *mapped_ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
+ cl_mem memobj,
+ void *mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
}
@@ -521,7 +516,7 @@ cl_int clRetainCommandQueue(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -535,7 +530,7 @@ cl_int clReleaseContext(cl_context context)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context);
}
@@ -548,7 +543,7 @@ cl_int clReleaseEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event);
}
@@ -558,22 +553,22 @@ cl_int clReleaseEvent(cl_event event)
}
}
-cl_int clEnqueueWriteBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_write,
- size_t offset,
- size_t size,
- const void *ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+ event);
}
else
{
@@ -581,22 +576,22 @@ cl_int clEnqueueWriteBuffer(
}
}
-cl_int clEnqueueReadBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_read,
- size_t offset,
- size_t size,
- void *ptr,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event)
+cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+ event);
}
else
{
@@ -604,17 +599,16 @@ cl_int clEnqueueReadBuffer(
}
}
-cl_int clGetProgramBuildInfo(
- cl_program program,
- cl_device_id device,
- cl_program_build_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetProgramBuildInfo(cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -628,7 +622,7 @@ cl_int clRetainProgram(cl_program program)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program);
}
@@ -638,27 +632,27 @@ cl_int clRetainProgram(cl_program program)
}
}
-void *clEnqueueMapBuffer(
- cl_command_queue command_queue,
- cl_mem buffer,
- cl_bool blocking_map,
- cl_map_flags map_flags,
- size_t offset,
- size_t size,
- cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list,
- cl_event *event,
- cl_int *errcode_ret)
+void *clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+ return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list,
+ event_wait_list, event, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -670,7 +664,7 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -680,24 +674,23 @@ cl_int clReleaseCommandQueue(cl_command_queue command_queue)
}
}
-cl_program clCreateProgramWithBinary(
- cl_context context,
- cl_uint num_devices,
- const cl_device_id *device_list,
- const size_t *lengths,
- const unsigned char **binaries,
- cl_int *binary_status,
- cl_int *errcode_ret)
+cl_program clCreateProgramWithBinary(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const size_t *lengths,
+ const unsigned char **binaries,
+ cl_int *binary_status,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -709,7 +702,7 @@ cl_int clRetainContext(cl_context context)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainContext_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context);
}
@@ -723,7 +716,7 @@ cl_int clReleaseProgram(cl_program program)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program);
}
@@ -737,7 +730,7 @@ cl_int clFlush(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clFlush_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -751,7 +744,7 @@ cl_int clFinish(cl_command_queue command_queue)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clFinish_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue);
}
@@ -761,16 +754,15 @@ cl_int clFinish(cl_command_queue command_queue)
}
}
-cl_int clGetProgramInfo(
- cl_program program,
- cl_program_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetProgramInfo(cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -780,20 +772,17 @@ cl_int clGetProgramInfo(
}
}
-cl_kernel clCreateKernel(
- cl_program program,
- const char *kernel_name,
- cl_int *errcode_ret)
+cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(program, kernel_name, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -805,7 +794,7 @@ cl_int clRetainKernel(cl_kernel kernel)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel);
}
@@ -815,22 +804,17 @@ cl_int clRetainKernel(cl_kernel kernel)
}
}
-cl_mem clCreateBuffer(
- cl_context context,
- cl_mem_flags flags,
- size_t size,
- void *host_ptr,
- cl_int *errcode_ret)
+cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, size, host_ptr, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -839,21 +823,17 @@ cl_mem clCreateBuffer(
}
cl_program clCreateProgramWithSource(
- cl_context context,
- cl_uint count,
- const char **strings,
- const size_t *lengths,
- cl_int *errcode_ret)
+ cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, count, strings, lengths, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -865,7 +845,7 @@ cl_int clReleaseKernel(cl_kernel kernel)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel);
}
@@ -878,12 +858,12 @@ cl_int clReleaseKernel(cl_kernel kernel)
cl_int clGetDeviceIDs(cl_platform_id platform,
cl_device_type device_type,
cl_uint num_entries,
- cl_device_id *devices,
+ cl_device_id *devices,
cl_uint *num_devices)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(platform, device_type, num_entries, devices, num_devices);
}
@@ -901,7 +881,7 @@ cl_int clGetDeviceInfo(cl_device_id device,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -911,15 +891,12 @@ cl_int clGetDeviceInfo(cl_device_id device,
}
}
-cl_int clGetMemObjectInfo(cl_mem memobj,
- cl_mem_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetMemObjectInfo(
+ cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(memobj, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -933,7 +910,7 @@ cl_int clRetainEvent(cl_event event)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event);
}
@@ -951,7 +928,7 @@ cl_int clGetPlatformInfo(cl_platform_id platform,
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(platform, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -965,7 +942,7 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_entries, platforms, num_platforms);
}
@@ -975,17 +952,16 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
}
}
-cl_int
-clGetKernelWorkGroupInfo(cl_kernel kernel,
- cl_device_id device,
- cl_kernel_work_group_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -995,16 +971,15 @@ clGetKernelWorkGroupInfo(cl_kernel kernel,
}
}
-cl_int
-clGetCommandQueueInfo(cl_command_queue command_queue,
- cl_command_queue_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -1014,16 +989,15 @@ clGetCommandQueueInfo(cl_command_queue command_queue,
}
}
-cl_int
-clGetKernelInfo(cl_kernel kernel,
- cl_kernel_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -1033,16 +1007,15 @@ clGetKernelInfo(cl_kernel kernel,
}
}
-cl_int
-clGetEventProfilingInfo(cl_event event,
- cl_profiling_info param_name,
- size_t param_value_size,
- void *param_value,
- size_t *param_value_size_ret)
+cl_int clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(event, param_name, param_value_size, param_value, param_value_size_ret);
}
@@ -1052,23 +1025,22 @@ clGetEventProfilingInfo(cl_event event,
}
}
-cl_mem
-clCreateImage(cl_context context,
- cl_mem_flags flags,
- const cl_image_format *image_format,
- const cl_image_desc *image_desc,
- void *host_ptr,
- cl_int *errcode_ret)
+cl_mem clCreateImage(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format *image_format,
+ const cl_image_desc *image_desc,
+ void *host_ptr,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clCreateImage_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
@@ -1076,14 +1048,12 @@ clCreateImage(cl_context context,
}
}
-cl_int clSetKernelExecInfo(cl_kernel kernel,
- cl_kernel_exec_info param_name,
- size_t param_value_size,
- const void *param_value)
+cl_int
+clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(kernel, param_name, param_value_size, param_value);
}
@@ -1093,22 +1063,21 @@ cl_int clSetKernelExecInfo(cl_kernel kernel,
}
}
-cl_command_buffer_khr clCreateCommandBufferKHR(
- cl_uint num_queues,
- const cl_command_queue* queues,
- const cl_command_buffer_properties_khr* properties,
- cl_int* errcode_ret)
+cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint num_queues,
+ const cl_command_queue *queues,
+ const cl_command_buffer_properties_khr *properties,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_queues, queues, properties, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_INVALID_OPERATION;
}
@@ -1122,7 +1091,7 @@ cl_int clFinalizeCommandBufferKHR(cl_command_buffer_khr command_buffer)
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer);
}
@@ -1137,7 +1106,7 @@ cl_int clRetainCommandBufferKHR(cl_command_buffer_khr command_buffer)
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer);
}
@@ -1152,7 +1121,7 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer)
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer);
}
@@ -1162,18 +1131,17 @@ cl_int clReleaseCommandBufferKHR(cl_command_buffer_khr command_buffer)
}
}
-cl_int clEnqueueCommandBufferKHR(
- cl_uint num_queues,
- cl_command_queue* queues,
- cl_command_buffer_khr command_buffer,
- cl_uint num_events_in_wait_list,
- const cl_event* event_wait_list,
- cl_event* event)
+cl_int clEnqueueCommandBufferKHR(cl_uint num_queues,
+ cl_command_queue *queues,
+ cl_command_buffer_khr command_buffer,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event);
}
@@ -1183,27 +1151,26 @@ cl_int clEnqueueCommandBufferKHR(
}
}
-
-cl_int clCommandNDRangeKernelKHR(
- cl_command_buffer_khr command_buffer,
- cl_command_queue command_queue,
- const cl_ndrange_kernel_command_properties_khr* properties,
- cl_kernel kernel,
- cl_uint work_dim,
- const size_t* global_work_offset,
- const size_t* global_work_size,
- const size_t* local_work_size,
- cl_uint num_sync_points_in_wait_list,
- const cl_sync_point_khr* sync_point_wait_list,
- cl_sync_point_khr* sync_point,
- cl_mutable_command_khr* mutable_handle)
+cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr command_buffer,
+ cl_command_queue command_queue,
+ const cl_ndrange_kernel_command_properties_khr *properties,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_sync_points_in_wait_list,
+ const cl_sync_point_khr *sync_point_wait_list,
+ cl_sync_point_khr *sync_point,
+ cl_mutable_command_khr *mutable_handle)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
- return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
+ return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size,
+ local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
}
else
{
@@ -1211,14 +1178,13 @@ cl_int clCommandNDRangeKernelKHR(
}
}
-cl_int clUpdateMutableCommandsKHR(
- cl_command_buffer_khr command_buffer,
- const cl_mutable_base_config_khr* mutable_config)
+cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr command_buffer,
+ const cl_mutable_base_config_khr *mutable_config)
{
arm_compute::CLSymbols::get().load_default();
const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(command_buffer, mutable_config);
}
@@ -1228,23 +1194,22 @@ cl_int clUpdateMutableCommandsKHR(
}
}
-cl_mem
-clImportMemoryARM(cl_context context,
- cl_mem_flags flags,
- const cl_import_properties_arm *properties,
- void *memory,
- size_t size,
- cl_int *errcode_ret)
+cl_mem clImportMemoryARM(cl_context context,
+ cl_mem_flags flags,
+ const cl_import_properties_arm *properties,
+ void *memory,
+ size_t size,
+ cl_int *errcode_ret)
{
arm_compute::CLSymbols::get().load_default();
auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr;
- if(func != nullptr)
+ if (func != nullptr)
{
return func(context, flags, properties, memory, size, errcode_ret);
}
else
{
- if(errcode_ret != nullptr)
+ if (errcode_ret != nullptr)
{
*errcode_ret = CL_OUT_OF_RESOURCES;
}
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index 3f93c8d6fc..02faae2369 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -31,7 +31,8 @@
#endif // GPU_ARCH == GPU_ARCH_BIFROST
// Hard-Swish
-#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
// Logistic Activation
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
@@ -49,13 +50,16 @@
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
// Leaky RELU Activation
-#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
// Soft RELU Activation
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
// ELU Activation
-#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
+#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \
+ (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
// Absolute Activation
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
@@ -70,7 +74,8 @@
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
// GELU Activation
-#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
+#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+ (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
// Identity Activation
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
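(The activation macros above are pure line-wrap changes; the math is untouched. For instance, hard_swish_op computes x * clamp(x + 3, 0, 6) / 6 — the 0.166666667 factor is 1/6 — and gelu_op computes x * 0.5 * (1 + erf(x / sqrt(2))). A scalar C++ sketch of the hard-swish form, for reference only and not part of the patch:)

#include <cmath>

// Scalar reference of the formula used by hard_swish_op above.
static float hard_swish_ref(float x)
{
    const float relu6 = std::fmin(std::fmax(x + 3.0f, 0.0f), 6.0f); // clamp(x + 3, 0, 6)
    return x * relu6 * (1.0f / 6.0f);                               // same as x * relu6 * 0.166666667
}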
diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h
index c420578546..c758ff1278 100644
--- a/src/core/CL/cl_kernels/activation_quant_helpers.h
+++ b/src/core/CL/cl_kernels/activation_quant_helpers.h
@@ -60,17 +60,17 @@ inline TYPE identiy_op(TYPE x)
}
#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
#if defined(S1_VAL) && defined(S2_VAL)
#if defined(O1_VAL) && defined(O2_VAL)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
@@ -78,17 +78,14 @@ inline TYPE identiy_op(TYPE x)
#define PERFORM_ACTIVATION_QUANT(act, data) \
({ \
data = ACTIVATION_OP(act, data); \
- \
+ \
VEC_DATA_TYPE(float, VEC_SIZE) \
fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)); \
- \
+ \
fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL)); \
data = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
})
#endif /* defined(O1_VAL) && defined(O2_VAL) */
#else /* defined(S1_VAL) && defined(S2_VAL) */
-#define PERFORM_ACTIVATION_QUANT(act, data) \
- ({ \
- data = ACTIVATION_OP(act, data); \
- })
+#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); })
#endif /* defined(S1_VAL) && defined(S2_VAL) */
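(The PERFORM_ACTIVATION_QUANT variants above apply the activation in the quantized domain and then requantize: with both scales and offsets defined the value is mapped as round((q - O1) * S1 / S2 + O2) and saturated back to the vector data type; without offsets only the scale ratio is applied. A scalar C++ sketch of that mapping, using assumed names (q_in, s1, o1, s2, o2) purely to make the macro's arithmetic explicit:)

#include <cmath>
#include <cstdint>

// Requantize one already-activated value from scale s1 / offset o1 to scale s2 / offset o2,
// mirroring the arithmetic inside PERFORM_ACTIVATION_QUANT (8-bit case assumed).
static std::uint8_t requantize_u8(std::uint8_t q_in, float s1, int o1, float s2, int o2)
{
    float f = std::round((static_cast<float>(q_in) - o1) * (s1 / s2) + o2);
    f = std::fmin(std::fmax(f, 0.0f), 255.0f); // CONVERT_SAT: saturate to the output type's range
    return static_cast<std::uint8_t>(f);
}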
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 0e938cb668..4bef02314f 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -34,14 +34,14 @@
*
*/
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
-#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
// offset == 0
-#define scalar_access_0_1(x) ((x).s0)
-#define scalar_access_0_2(x) ((x).s01)
-#define scalar_access_0_3(x) ((x).s012)
-#define scalar_access_0_4(x) ((x).s0123)
-#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_1(x) ((x).s0)
+#define scalar_access_0_2(x) ((x).s01)
+#define scalar_access_0_3(x) ((x).s012)
+#define scalar_access_0_4(x) ((x).s0123)
+#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
// offset == 1
@@ -100,8 +100,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
- ({})
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({})
#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
@@ -186,8 +185,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
-#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+ LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR
/** Load 2D tensor (consecutive rows and columns) with Z offset.
@@ -202,8 +203,7 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
- ({})
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({})
#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
@@ -279,8 +279,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @}*/ // end of group LOAD_TENSOR_M0XN0
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -395,8 +397,10 @@
* @param[in] Z The z-axis offset vector
* @{
*/
-#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK
/** Partially load the 0 to (n-1)th rows of the given variables
@@ -517,8 +521,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** Load a block that can be partial in both x and y dimensions
*
* @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
@@ -541,22 +547,23 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
- { \
- LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else \
- { \
- LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
}
/** Load a block that can only be partial in x but not y.
*
@@ -578,14 +585,15 @@
* @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else \
- { \
- LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \
+ PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
}
/** Load a block that can only be partial in y but not x.
*
@@ -607,14 +615,15 @@
* @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
*/
-#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
- if(!(PARTIAL_COND_Y)) \
- { \
- LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
- } \
- else \
- { \
- LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_COND_Y) \
+ if (!(PARTIAL_COND_Y)) \
+ { \
+ LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
}
/** @} */ // end of group LOAD_BLOCK_PARTIAL
/** Boundary-aware GeMM block load
@@ -676,28 +685,33 @@
*/
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
- LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
+ LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
-#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
/** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -795,8 +809,10 @@
* @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
* @{
*/
-#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
-#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+ LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
/** @} */ // end of group LOAD_TEXTURE2D
/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
@@ -815,7 +831,7 @@
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##0; \
- if(Y_MASK##0 != 0) \
+ if (Y_MASK##0 != 0) \
BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
else \
BASENAME##0 = 0;
@@ -824,7 +840,7 @@
LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##1; \
- if(Y_MASK##1 != 0) \
+ if (Y_MASK##1 != 0) \
BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
else \
BASENAME##1 = 0;
@@ -833,7 +849,7 @@
LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##2; \
- if(Y_MASK##2 != 0) \
+ if (Y_MASK##2 != 0) \
BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
else \
BASENAME##2 = 0;
@@ -842,7 +858,7 @@
LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##3; \
- if(Y_MASK##3 != 0) \
+ if (Y_MASK##3 != 0) \
BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
else \
BASENAME##3 = 0;
@@ -851,7 +867,7 @@
LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##4; \
- if(Y_MASK##4 != 0) \
+ if (Y_MASK##4 != 0) \
BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
else \
BASENAME##4 = 0;
@@ -860,7 +876,7 @@
LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##5; \
- if(Y_MASK##5 != 0) \
+ if (Y_MASK##5 != 0) \
BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
else \
BASENAME##5 = 0;
@@ -869,7 +885,7 @@
LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##6; \
- if(Y_MASK##6 != 0) \
+ if (Y_MASK##6 != 0) \
BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
else \
BASENAME##6 = 0;
@@ -878,7 +894,7 @@
LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##7; \
- if(Y_MASK##7 != 0) \
+ if (Y_MASK##7 != 0) \
BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
else \
BASENAME##7 = 0;
@@ -887,7 +903,7 @@
LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##8; \
- if(Y_MASK##8 != 0) \
+ if (Y_MASK##8 != 0) \
BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
else \
BASENAME##8 = 0;
@@ -896,7 +912,7 @@
LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##9; \
- if(Y_MASK##9 != 0) \
+ if (Y_MASK##9 != 0) \
BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
else \
BASENAME##9 = 0;
@@ -905,7 +921,7 @@
LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##A; \
- if(Y_MASK##A != 0) \
+ if (Y_MASK##A != 0) \
BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
else \
BASENAME##A = 0;
@@ -914,7 +930,7 @@
LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##B; \
- if(Y_MASK##B != 0) \
+ if (Y_MASK##B != 0) \
BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
else \
BASENAME##B = 0;
@@ -923,7 +939,7 @@
LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##C; \
- if(Y_MASK##C != 0) \
+ if (Y_MASK##C != 0) \
BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
else \
BASENAME##C = 0;
@@ -932,7 +948,7 @@
LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##D; \
- if(Y_MASK##D != 0) \
+ if (Y_MASK##D != 0) \
BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
else \
BASENAME##D = 0;
@@ -941,7 +957,7 @@
LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##E; \
- if(Y_MASK##E != 0) \
+ if (Y_MASK##E != 0) \
BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
else \
BASENAME##E = 0;
@@ -950,7 +966,7 @@
LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
VEC_DATA_TYPE(DATA_TYPE, N0) \
BASENAME##F; \
- if(Y_MASK##F != 0) \
+ if (Y_MASK##F != 0) \
BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
else \
BASENAME##F = 0;
@@ -976,8 +992,10 @@
* @param[in] Y_MASK The y-axis mask vector. If 0, forces BASENAMEn to 0
* @{
*/
-#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
-#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+ LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
/** @} */ // end of group LOAD_BLOCK_INDIRECT
/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -1088,8 +1106,10 @@
* @param[in] STRIDE_Y The stride in y-axis direction
* @{
*/
-#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
-#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
/** Basic macros to calculate Z offset values from Z0 to Zn-1
@@ -1187,8 +1207,10 @@
* @param[in] STRIDE_Y The stride value in y-axis direction
* @{
*/
-#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
-#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
@@ -1199,8 +1221,7 @@
* @param[in] SCALE The scale factor
* @{
*/
-#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
- BASENAME##0 *= (DATA_TYPE)SCALE;
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
@@ -1275,7 +1296,7 @@
* @{
*/
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
-#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
/** Create a new vector containing the values at the given index for a set of given vectors
@@ -1287,8 +1308,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
@@ -1297,13 +1317,20 @@
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 4) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
-#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 8) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
-#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+ (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
+ (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
+ (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
/** Create a new vector containing the values at the given index. Utility macros for transposing a colum-vector
@@ -1315,8 +1342,7 @@
* @param[in] TYPE The data type of the destination vectors
* @{
*/
-#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
- TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 2) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
@@ -1329,9 +1355,10 @@
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
VEC_DATA_TYPE(TYPE, 8) \
BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
-#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
- VEC_DATA_TYPE(TYPE, 16) \
- BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
/** @} */ // end of group COLUMN_VECTOR_SCALARn
/** Create transposed vectors of the given vectors
@@ -1343,8 +1370,7 @@
* @param[in] TYPE The data type of the transposed vectors
* @{
*/
-#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
- COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
@@ -1417,8 +1443,7 @@
* @param[in] BIAS The basename of the added variables
* @{
*/
-#define ADD_ROW_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS##0;
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
#define ADD_ROW_2(BASENAME, BIAS) \
ADD_ROW_1(BASENAME, BIAS) \
@@ -1493,7 +1518,7 @@
* @{
*/
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
-#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
/** Broadcast (add single value) to the each element of the destination variables
@@ -1503,8 +1528,7 @@
* @param[in] BIAS The variable containing the value to add
* @{
*/
-#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
- BASENAME##0 += BIAS;
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
@@ -1578,7 +1602,7 @@
* @{
*/
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
-#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
/** Apply activation to the given variables
@@ -1668,8 +1692,10 @@
* @param[in] B_VAL Additional value required by the activation
* @{
*/
-#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
-#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
/** Apply convert_<data_type> to the given variables
@@ -1773,6 +1799,8 @@
* @param[in] BASENAME_DST The basename of the destination variables
* @{
*/
-#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
/** @} */ // end of group CONVERT_BLOCK
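(A pattern worth noting in gemm_helpers.h, and again in helpers.h below: each block macro comes as a FOO_STR / FOO pair. The extra level exists so that arguments such as M0 or N0 are macro-expanded before token pasting with ##; pasting in a single macro would glue the unexpanded parameter instead. A tiny self-contained C++ illustration of why the indirection is needed — ROW_2, DEMO_STR, DEMO and ROWS are invented for this example only:)

#include <cstdio>

#define ROW_2() "two rows"

// DEMO_STR pastes directly; DEMO adds one indirection so its argument expands first.
#define DEMO_STR(M0) ROW_##M0()
#define DEMO(M0)     DEMO_STR(M0)

#define ROWS 2

int main()
{
    // DEMO(ROWS) first expands ROWS to 2, then DEMO_STR(2) pastes ROW_2.
    // Calling DEMO_STR(ROWS) directly would paste ROW_ROWS and fail to compile.
    std::printf("%s\n", DEMO(ROWS)); // prints "two rows"
    return 0;
}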
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index b2ceaf92f3..87a1875f93 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -81,11 +81,11 @@
* @return The reversed vector
* @{
*/
-#define REV1(x) ((x))
-#define REV2(x) ((x).s10)
-#define REV3(x) ((x).s210)
-#define REV4(x) ((x).s3210)
-#define REV8(x) ((x).s76543210)
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)
/** @} */ // end of group REVn
@@ -99,7 +99,7 @@
* @{
*/
#define REVERSE_STR(x, s) REV##s((x))
-#define REVERSE(x, s) REVERSE_STR(x, s)
+#define REVERSE(x, s) REVERSE_STR(x, s)
/** @} */ // end of group REVERSE
/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
@@ -138,16 +138,16 @@
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))
-#define ROT16_0(x) ((x))
-#define ROT16_1(x) ((x).sF0123456789ABCDE)
-#define ROT16_2(x) ((x).sEF0123456789ABCD)
-#define ROT16_3(x) ((x).sDEF0123456789ABC)
-#define ROT16_4(x) ((x).sCDEF0123456789AB)
-#define ROT16_5(x) ((x).sBCDEF0123456789A)
-#define ROT16_6(x) ((x).sABCDEF0123456789)
-#define ROT16_7(x) ((x).s9ABCDEF012345678)
-#define ROT16_8(x) ((x).s89ABCDEF01234567)
-#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
@@ -168,7 +168,7 @@
* @{
*/
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
-#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
/** @} */ // end of group ROTATE
/** Creates a vector of size n filled with offset values corresponding to the location of each element.
@@ -179,11 +179,11 @@
* @return The vector filled with offset values
* @{
*/
-#define V_OFFS1(dt) (dt##1)(0)
-#define V_OFFS2(dt) (dt##2)(0, 1)
-#define V_OFFS3(dt) (dt##3)(0, 1, 2)
-#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS1(dt) (dt##1)(0)
+#define V_OFFS2(dt) (dt##2)(0, 1)
+#define V_OFFS3(dt) (dt##3)(0, 1, 2)
+#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
/** @} */ // end of group V_OFFSn
@@ -197,11 +197,11 @@
* @{
*/
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
-#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
/** @} */ // end of group VEC_OFFS
#define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
+#define VLOAD(size) VLOAD_STR(size)
/** Extended partial vload that correctly handles scalar values as well.
* Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
@@ -219,23 +219,23 @@
* @{
*/
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
-#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
+#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
#define NO_LOAD(data, offs, ptr) \
{ \
}
// Size == 1 (scalar)
-#define vload_partial_1_0 NO_LOAD
-#define vload_partial_1_1 vload1
-#define vload_partial_1_2 NO_LOAD
-#define vload_partial_1_3 NO_LOAD
-#define vload_partial_1_4 NO_LOAD
-#define vload_partial_1_5 NO_LOAD
-#define vload_partial_1_6 NO_LOAD
-#define vload_partial_1_7 NO_LOAD
-#define vload_partial_1_8 NO_LOAD
-#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_0 NO_LOAD
+#define vload_partial_1_1 vload1
+#define vload_partial_1_2 NO_LOAD
+#define vload_partial_1_3 NO_LOAD
+#define vload_partial_1_4 NO_LOAD
+#define vload_partial_1_5 NO_LOAD
+#define vload_partial_1_6 NO_LOAD
+#define vload_partial_1_7 NO_LOAD
+#define vload_partial_1_8 NO_LOAD
+#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
@@ -244,16 +244,16 @@
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD
// Size == 2
-#define vload_partial_2_0 NO_LOAD
-#define vload_partial_2_1 vload_partial_1
-#define vload_partial_2_2 vload_partial_2
-#define vload_partial_2_3 NO_LOAD
-#define vload_partial_2_4 NO_LOAD
-#define vload_partial_2_5 NO_LOAD
-#define vload_partial_2_6 NO_LOAD
-#define vload_partial_2_7 NO_LOAD
-#define vload_partial_2_8 NO_LOAD
-#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_0 NO_LOAD
+#define vload_partial_2_1 vload_partial_1
+#define vload_partial_2_2 vload_partial_2
+#define vload_partial_2_3 NO_LOAD
+#define vload_partial_2_4 NO_LOAD
+#define vload_partial_2_5 NO_LOAD
+#define vload_partial_2_6 NO_LOAD
+#define vload_partial_2_7 NO_LOAD
+#define vload_partial_2_8 NO_LOAD
+#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
@@ -262,16 +262,16 @@
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD
// Size == 3
-#define vload_partial_3_0 NO_LOAD
-#define vload_partial_3_1 vload_partial_1
-#define vload_partial_3_2 vload_partial_2
-#define vload_partial_3_3 vload_partial_3
-#define vload_partial_3_4 NO_LOAD
-#define vload_partial_3_5 NO_LOAD
-#define vload_partial_3_6 NO_LOAD
-#define vload_partial_3_7 NO_LOAD
-#define vload_partial_3_8 NO_LOAD
-#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_0 NO_LOAD
+#define vload_partial_3_1 vload_partial_1
+#define vload_partial_3_2 vload_partial_2
+#define vload_partial_3_3 vload_partial_3
+#define vload_partial_3_4 NO_LOAD
+#define vload_partial_3_5 NO_LOAD
+#define vload_partial_3_6 NO_LOAD
+#define vload_partial_3_7 NO_LOAD
+#define vload_partial_3_8 NO_LOAD
+#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
@@ -280,16 +280,16 @@
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD
// Size == 4
-#define vload_partial_4_0 NO_LOAD
-#define vload_partial_4_1 vload_partial_1
-#define vload_partial_4_2 vload_partial_2
-#define vload_partial_4_3 vload_partial_3
-#define vload_partial_4_4 vload_partial_4
-#define vload_partial_4_5 NO_LOAD
-#define vload_partial_4_6 NO_LOAD
-#define vload_partial_4_7 NO_LOAD
-#define vload_partial_4_8 NO_LOAD
-#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_0 NO_LOAD
+#define vload_partial_4_1 vload_partial_1
+#define vload_partial_4_2 vload_partial_2
+#define vload_partial_4_3 vload_partial_3
+#define vload_partial_4_4 vload_partial_4
+#define vload_partial_4_5 NO_LOAD
+#define vload_partial_4_6 NO_LOAD
+#define vload_partial_4_7 NO_LOAD
+#define vload_partial_4_8 NO_LOAD
+#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
@@ -298,16 +298,16 @@
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD
// Size == 8
-#define vload_partial_8_0 NO_LOAD
-#define vload_partial_8_1 vload_partial_1
-#define vload_partial_8_2 vload_partial_2
-#define vload_partial_8_3 vload_partial_3
-#define vload_partial_8_4 vload_partial_4
-#define vload_partial_8_5 vload_partial_5
-#define vload_partial_8_6 vload_partial_6
-#define vload_partial_8_7 vload_partial_7
-#define vload_partial_8_8 vload_partial_8
-#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_0 NO_LOAD
+#define vload_partial_8_1 vload_partial_1
+#define vload_partial_8_2 vload_partial_2
+#define vload_partial_8_3 vload_partial_3
+#define vload_partial_8_4 vload_partial_4
+#define vload_partial_8_5 vload_partial_5
+#define vload_partial_8_6 vload_partial_6
+#define vload_partial_8_7 vload_partial_7
+#define vload_partial_8_8 vload_partial_8
+#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
@@ -316,16 +316,16 @@
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD
// Size == 16
-#define vload_partial_16_0 NO_LOAD
-#define vload_partial_16_1 vload_partial_1
-#define vload_partial_16_2 vload_partial_2
-#define vload_partial_16_3 vload_partial_3
-#define vload_partial_16_4 vload_partial_4
-#define vload_partial_16_5 vload_partial_5
-#define vload_partial_16_6 vload_partial_6
-#define vload_partial_16_7 vload_partial_7
-#define vload_partial_16_8 vload_partial_8
-#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_0 NO_LOAD
+#define vload_partial_16_1 vload_partial_1
+#define vload_partial_16_2 vload_partial_2
+#define vload_partial_16_3 vload_partial_3
+#define vload_partial_16_4 vload_partial_4
+#define vload_partial_16_5 vload_partial_5
+#define vload_partial_16_6 vload_partial_6
+#define vload_partial_16_7 vload_partial_7
+#define vload_partial_16_8 vload_partial_8
+#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
@@ -351,17 +351,13 @@
* @param[in] PTR The base pointer
* @{
*/
-#define vload_partial_1(DATA, OFFSET, PTR) \
- DATA.s0 = vload1(OFFSET, PTR);
+#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);
-#define vload_partial_2(DATA, OFFSET, PTR) \
- DATA.s01 = vload2(OFFSET, PTR);
+#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);
-#define vload_partial_3(DATA, OFFSET, PTR) \
- DATA.s012 = vload3(OFFSET, PTR);
+#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);
-#define vload_partial_4(DATA, OFFSET, PTR) \
- DATA.s0123 = vload4(OFFSET, PTR);
+#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);
#define vload_partial_5(DATA, OFFSET, PTR) \
vload_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -375,8 +371,7 @@
vload_partial_4(DATA.s0123, OFFSET, PTR); \
vload_partial_3(DATA.s456, OFFSET, PTR + 4);
-#define vload_partial_8(DATA, OFFSET, PTR) \
- DATA.s01234567 = vload8(OFFSET, PTR);
+#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);
#define vload_partial_9(DATA, OFFSET, PTR) \
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -406,13 +401,12 @@
vload_partial_8(DATA.s01234567, OFFSET, PTR); \
vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
-#define vload_partial_16(DATA, OFFSET, PTR) \
- DATA = vload16(OFFSET, PTR);
+#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
/** @} */ // end of groupd vload_partial_n
/** @} */ // end of groupd VLOAD_PARTIAL
-#define PIXEL_UNIT4 1
-#define PIXEL_UNIT8 2
+#define PIXEL_UNIT4 1
+#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4
/** Utility macro to convert a vector size in pixel unit.
@@ -425,27 +419,45 @@
* @{
*/
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
-#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
-#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_floatx2(img, x_coord, y_coord) \
+ (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_floatx4(img, x_coord, y_coord) \
+ (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \
+ read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
-#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_halfx2(img, x_coord, y_coord) \
+ (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_halfx4(img, x_coord, y_coord) \
+ (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \
+ read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
-#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
-#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#define write_image2d_floatx2(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values) \
+ (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
-#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
-#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#define write_image2d_halfx2(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values) \
+ (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+ write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+ write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+ write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
/** Utility macro to read a 2D OpenCL image object.
@@ -462,7 +474,7 @@
* @{
*/
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
-#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
/** @} */
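A minimal usage sketch (placeholder image and coordinate names, not part of the patch); note that the expansions above already end with a semicolon, so none is added here.
// Read 4 adjacent RGBA pixels (16 floats) starting at pixel (x_px, y_px) of src_img.
float16 block = READ_IMAGE2D(float, 4, src_img, x_px, y_px)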
/** Utility macro to write a 2D OpenCL image object.
@@ -478,26 +490,28 @@
*
* @{
*/
-#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
-#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
+ write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
+ WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
/** @} */
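The matching write, again as a hedged sketch with placeholder names; the expansion supplies its own trailing semicolon.
// Write the float16 block back as 4 adjacent RGBA pixels of dst_img.
WRITE_IMAGE2D(float, 4, dst_img, x_px, y_px, block)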
#define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
+#define VSTORE(size) VSTORE_STR(size)
-#define float1 float
-#define half1 half
-#define char1 char
-#define uchar1 uchar
-#define short1 short
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
#define ushort1 ushort
-#define int1 int
-#define uint1 uint
-#define long1 long
-#define ulong1 ulong
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
#define double1 double
-#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
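These scalar aliases let size-generic code compile when the vector width degenerates to 1; a small sketch follows (out_ptr is a placeholder __global float pointer).
// VSTORE(1)(v, 0, out_ptr) expands to vstore1(v, 0, out_ptr), i.e. *(0 + out_ptr) = v
float v = 3.0f;
VSTORE(1)(v, 0, out_ptr);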
/** Extended partial vstore that correctly handles scalar values as well.
@@ -516,23 +530,23 @@
* @{
*/
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
-#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
#define NO_STORE(data, offs, ptr) \
{ \
}
// Size == 1 (scalar)
-#define vstore_partial_1_0 NO_STORE
-#define vstore_partial_1_1 vstore1
-#define vstore_partial_1_2 NO_STORE
-#define vstore_partial_1_3 NO_STORE
-#define vstore_partial_1_4 NO_STORE
-#define vstore_partial_1_5 NO_STORE
-#define vstore_partial_1_6 NO_STORE
-#define vstore_partial_1_7 NO_STORE
-#define vstore_partial_1_8 NO_STORE
-#define vstore_partial_1_9 NO_STORE
+#define vstore_partial_1_0 NO_STORE
+#define vstore_partial_1_1 vstore1
+#define vstore_partial_1_2 NO_STORE
+#define vstore_partial_1_3 NO_STORE
+#define vstore_partial_1_4 NO_STORE
+#define vstore_partial_1_5 NO_STORE
+#define vstore_partial_1_6 NO_STORE
+#define vstore_partial_1_7 NO_STORE
+#define vstore_partial_1_8 NO_STORE
+#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
@@ -541,16 +555,16 @@
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE
// Size == 2
-#define vstore_partial_2_0 NO_STORE
-#define vstore_partial_2_1 vstore_partial_1
-#define vstore_partial_2_2 vstore_partial_2
-#define vstore_partial_2_3 NO_STORE
-#define vstore_partial_2_4 NO_STORE
-#define vstore_partial_2_5 NO_STORE
-#define vstore_partial_2_6 NO_STORE
-#define vstore_partial_2_7 NO_STORE
-#define vstore_partial_2_8 NO_STORE
-#define vstore_partial_2_9 NO_STORE
+#define vstore_partial_2_0 NO_STORE
+#define vstore_partial_2_1 vstore_partial_1
+#define vstore_partial_2_2 vstore_partial_2
+#define vstore_partial_2_3 NO_STORE
+#define vstore_partial_2_4 NO_STORE
+#define vstore_partial_2_5 NO_STORE
+#define vstore_partial_2_6 NO_STORE
+#define vstore_partial_2_7 NO_STORE
+#define vstore_partial_2_8 NO_STORE
+#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
@@ -559,16 +573,16 @@
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE
// Size == 3
-#define vstore_partial_3_0 NO_STORE
-#define vstore_partial_3_1 vstore_partial_1
-#define vstore_partial_3_2 vstore_partial_2
-#define vstore_partial_3_3 vstore_partial_3
-#define vstore_partial_3_4 NO_STORE
-#define vstore_partial_3_5 NO_STORE
-#define vstore_partial_3_6 NO_STORE
-#define vstore_partial_3_7 NO_STORE
-#define vstore_partial_3_8 NO_STORE
-#define vstore_partial_3_9 NO_STORE
+#define vstore_partial_3_0 NO_STORE
+#define vstore_partial_3_1 vstore_partial_1
+#define vstore_partial_3_2 vstore_partial_2
+#define vstore_partial_3_3 vstore_partial_3
+#define vstore_partial_3_4 NO_STORE
+#define vstore_partial_3_5 NO_STORE
+#define vstore_partial_3_6 NO_STORE
+#define vstore_partial_3_7 NO_STORE
+#define vstore_partial_3_8 NO_STORE
+#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
@@ -577,16 +591,16 @@
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE
// Size == 4
-#define vstore_partial_4_0 NO_STORE
-#define vstore_partial_4_1 vstore_partial_1
-#define vstore_partial_4_2 vstore_partial_2
-#define vstore_partial_4_3 vstore_partial_3
-#define vstore_partial_4_4 vstore_partial_4
-#define vstore_partial_4_5 NO_STORE
-#define vstore_partial_4_6 NO_STORE
-#define vstore_partial_4_7 NO_STORE
-#define vstore_partial_4_8 NO_STORE
-#define vstore_partial_4_9 NO_STORE
+#define vstore_partial_4_0 NO_STORE
+#define vstore_partial_4_1 vstore_partial_1
+#define vstore_partial_4_2 vstore_partial_2
+#define vstore_partial_4_3 vstore_partial_3
+#define vstore_partial_4_4 vstore_partial_4
+#define vstore_partial_4_5 NO_STORE
+#define vstore_partial_4_6 NO_STORE
+#define vstore_partial_4_7 NO_STORE
+#define vstore_partial_4_8 NO_STORE
+#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
@@ -595,16 +609,16 @@
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE
// Size == 8
-#define vstore_partial_8_0 NO_STORE
-#define vstore_partial_8_1 vstore_partial_1
-#define vstore_partial_8_2 vstore_partial_2
-#define vstore_partial_8_3 vstore_partial_3
-#define vstore_partial_8_4 vstore_partial_4
-#define vstore_partial_8_5 vstore_partial_5
-#define vstore_partial_8_6 vstore_partial_6
-#define vstore_partial_8_7 vstore_partial_7
-#define vstore_partial_8_8 vstore_partial_8
-#define vstore_partial_8_9 NO_STORE
+#define vstore_partial_8_0 NO_STORE
+#define vstore_partial_8_1 vstore_partial_1
+#define vstore_partial_8_2 vstore_partial_2
+#define vstore_partial_8_3 vstore_partial_3
+#define vstore_partial_8_4 vstore_partial_4
+#define vstore_partial_8_5 vstore_partial_5
+#define vstore_partial_8_6 vstore_partial_6
+#define vstore_partial_8_7 vstore_partial_7
+#define vstore_partial_8_8 vstore_partial_8
+#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
@@ -613,16 +627,16 @@
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE
// Size == 16
-#define vstore_partial_16_0 NO_STORE
-#define vstore_partial_16_1 vstore_partial_1
-#define vstore_partial_16_2 vstore_partial_2
-#define vstore_partial_16_3 vstore_partial_3
-#define vstore_partial_16_4 vstore_partial_4
-#define vstore_partial_16_5 vstore_partial_5
-#define vstore_partial_16_6 vstore_partial_6
-#define vstore_partial_16_7 vstore_partial_7
-#define vstore_partial_16_8 vstore_partial_8
-#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_0 NO_STORE
+#define vstore_partial_16_1 vstore_partial_1
+#define vstore_partial_16_2 vstore_partial_2
+#define vstore_partial_16_3 vstore_partial_3
+#define vstore_partial_16_4 vstore_partial_4
+#define vstore_partial_16_5 vstore_partial_5
+#define vstore_partial_16_6 vstore_partial_6
+#define vstore_partial_16_7 vstore_partial_7
+#define vstore_partial_16_8 vstore_partial_8
+#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
@@ -648,17 +662,13 @@
* @param[in] PTR The base pointer
* @{
*/
-#define vstore_partial_1(DATA, OFFSET, PTR) \
- vstore1(DATA.s0, OFFSET, PTR);
+#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);
-#define vstore_partial_2(DATA, OFFSET, PTR) \
- vstore2(DATA.s01, OFFSET, PTR);
+#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);
-#define vstore_partial_3(DATA, OFFSET, PTR) \
- vstore3(DATA.s012, OFFSET, PTR);
+#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);
-#define vstore_partial_4(DATA, OFFSET, PTR) \
- vstore4(DATA.s0123, OFFSET, PTR);
+#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);
#define vstore_partial_5(DATA, OFFSET, PTR) \
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -672,8 +682,7 @@
vstore_partial_4(DATA.s0123, OFFSET, PTR); \
vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
-#define vstore_partial_8(DATA, OFFSET, PTR) \
- vstore8(DATA.s01234567, OFFSET, PTR);
+#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);
#define vstore_partial_9(DATA, OFFSET, PTR) \
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -703,186 +712,156 @@
vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
-#define vstore_partial_16(DATA, OFFSET, PTR) \
- vstore16(DATA, OFFSET, PTR);
+#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
/** @} */ // end of group vstore_partial_n
/** @} */ // end of group VSTORE_PARTIAL
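Putting the dispatch table above together, a hedged usage sketch (out_ptr is a placeholder pointer): a 4-element vector of which only the 3 leftover elements must be stored.
// VSTORE_PARTIAL(4, 3) -> vstore_partial_4_3 -> vstore_partial_3,
// which stores values.s012 and already ends with a semicolon.
float4 values = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
VSTORE_PARTIAL(4, 3)(values, 0, out_ptr)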
// The convert_* built-in functions with the _sat modifier are not supported for floating-point types,
// so we create defines without _sat to work around this issue
-#define convert_float_sat convert_float
-#define convert_float1_sat convert_float
-#define convert_float2_sat convert_float2
-#define convert_float3_sat convert_float3
-#define convert_float4_sat convert_float4
-#define convert_float8_sat convert_float8
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
-#define convert_half_sat convert_float
-#define convert_half1_sat convert_half
-#define convert_half2_sat convert_half2
-#define convert_half3_sat convert_half3
-#define convert_half4_sat convert_half4
-#define convert_half8_sat convert_half8
-#define convert_half16_sat convert_half16
-
-#define convert_float1 convert_float
-#define convert_half1 convert_half
-#define convert_char1 convert_char
-#define convert_uchar1 convert_uchar
-#define convert_short1 convert_short
+#define convert_half_sat convert_float
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
-#define convert_int1 convert_int
-#define convert_uint1 convert_uint
-#define convert_long1 convert_long
-#define convert_ulong1 convert_ulong
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
#define convert_double1 convert_double
-#define convert_char1_sat convert_char_sat
-#define convert_uchar1_sat convert_uchar_sat
-#define convert_uchar2_sat convert_uchar2_sat
-#define convert_uchar3_sat convert_uchar3_sat
-#define convert_uchar4_sat convert_uchar4_sat
-#define convert_uchar8_sat convert_uchar8_sat
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_uchar2_sat convert_uchar2_sat
+#define convert_uchar3_sat convert_uchar3_sat
+#define convert_uchar4_sat convert_uchar4_sat
+#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
-#define convert_short1_sat convert_short_sat
+#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
-#define convert_int1_sat convert_int_sat
-#define convert_uint1_sat convert_uint_sat
-#define convert_long1_sat convert_long_sat
-#define convert_ulong1_sat convert_ulong_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
#define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
+#define CONVERT(x, type) CONVERT_STR(x, type)
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
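The two-level _STR/plain pattern ensures macro arguments are fully expanded before token pasting; a hedged sketch of the common combination (placeholder variable names).
// VEC_DATA_TYPE(uchar, 4) expands to uchar4 before being pasted into convert_uchar4_sat.
VEC_DATA_TYPE(float, 4) acc_f32 = (float4)(0.5f, 1.5f, 300.0f, -2.0f);
VEC_DATA_TYPE(uchar, 4) out_u8  = CONVERT_SAT(acc_f32, VEC_DATA_TYPE(uchar, 4)); // saturates to the 0..255 range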
-#define select_vec_dt_uchar(size) uchar##size
-#define select_vec_dt_char(size) char##size
+#define select_vec_dt_uchar(size) uchar##size
+#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
-#define select_vec_dt_short(size) short##size
-#define select_vec_dt_half(size) short##size
-#define select_vec_dt_uint(size) uint##size
-#define select_vec_dt_int(size) int##size
-#define select_vec_dt_float(size) int##size
-#define select_vec_dt_ulong(size) ulong##size
-#define select_vec_dt_long(size) long##size
+#define select_vec_dt_short(size) short##size
+#define select_vec_dt_half(size) short##size
+#define select_vec_dt_uint(size) uint##size
+#define select_vec_dt_int(size) int##size
+#define select_vec_dt_float(size) int##size
+#define select_vec_dt_ulong(size) ulong##size
+#define select_vec_dt_long(size) long##size
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
-#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
-#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
+#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
+#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
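SELECT_VEC_DATA_TYPE maps a data type to the signed integer vector type that OpenCL comparisons produce and that select() expects as its mask; a minimal sketch with placeholder operands.
// For float4 operands the comparison result is int4, which is exactly SELECT_VEC_DATA_TYPE(float, 4).
float4 a = (float4)(1.0f), b = (float4)(2.0f);
float4 lo = select(b, a, (SELECT_VEC_DATA_TYPE(float, 4))(a < b)); // per-lane minimum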
-#define signed_int_vec_dt_uchar(size) char##size
-#define signed_int_vec_dt_char(size) char##size
+#define signed_int_vec_dt_uchar(size) char##size
+#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
-#define signed_int_vec_dt_short(size) short##size
-#define signed_int_vec_dt_half(size) short##size
-#define signed_int_vec_dt_uint(size) int##size
-#define signed_int_vec_dt_int(size) int##size
-#define signed_int_vec_dt_float(size) int##size
-#define signed_int_vec_dt_ulong(size) long##size
-#define signed_int_vec_dt_long(size) long##size
+#define signed_int_vec_dt_short(size) short##size
+#define signed_int_vec_dt_half(size) short##size
+#define signed_int_vec_dt_uint(size) int##size
+#define signed_int_vec_dt_int(size) int##size
+#define signed_int_vec_dt_float(size) int##size
+#define signed_int_vec_dt_ulong(size) long##size
+#define signed_int_vec_dt_long(size) long##size
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
-#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
-#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
-
-#define sum_reduce_1(x) (x)
-#define sum_reduce_2(x) ((x).s0) + ((x).s1)
-#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
-#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
-#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
+#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
+#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
+
+#define sum_reduce_1(x) (x)
+#define sum_reduce_2(x) ((x).s0) + ((x).s1)
+#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
+#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
+#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
-#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
+#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
-#define prod_reduce_1(x) (x)
-#define prod_reduce_2(x) ((x).s0) * ((x).s1)
-#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
-#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
-#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
+#define prod_reduce_1(x) (x)
+#define prod_reduce_2(x) ((x).s0) * ((x).s1)
+#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
+#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
+#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
-#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
+#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
-#define max_reduce_1(x) (x)
-#define max_reduce_2(x) max(((x).s0), ((x).s1))
-#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
-#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
-#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
+#define max_reduce_1(x) (x)
+#define max_reduce_2(x) max(((x).s0), ((x).s1))
+#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
+#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
+#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
-#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
+#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
-#define min_reduce_1(x) (x)
-#define min_reduce_2(x) min(((x).s0), ((x).s1))
-#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
-#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
-#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
+#define min_reduce_1(x) (x)
+#define min_reduce_2(x) min(((x).s0), ((x).s1))
+#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
+#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
+#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
#define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF))
#define MIN_REDUCE_STR(x, size) min_reduce_##size(x)
-#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
-
-#define VECTOR_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_offset_first_element_in_bytes
-
-#define IMAGE_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR3D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR4D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_stride_w, \
- uint name##_step_w, \
- uint name##_offset_first_element_in_bytes
-
-#define TENSOR5D_DECLARATION(name) \
- __global uchar *name##_ptr, \
- uint name##_stride_x, \
- uint name##_step_x, \
- uint name##_stride_y, \
- uint name##_step_y, \
- uint name##_stride_z, \
- uint name##_step_z, \
- uint name##_stride_w, \
- uint name##_step_w, \
- uint name##_stride_v, \
- uint name##_step_v, \
- uint name##_offset_first_element_in_bytes
+#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
+
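The reduction helpers expand recursively over vector halves; a small worked sketch (placeholder values, not part of the patch).
// SUM_REDUCE(v, 4) -> sum_reduce_4(v) -> (v.s0 + v.s1) + (v.s2 + v.s3)
float4 v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
float s = SUM_REDUCE(v, 4); // 10.0f
float m = MAX_REDUCE(v, 4); // 4.0f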
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR5D_DECLARATION(name) \
+ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+ uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v, \
+ uint name##_step_v, uint name##_offset_first_element_in_bytes
#define CONVERT_TO_VECTOR_STRUCT(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -890,38 +869,47 @@
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
-#define CONVERT_TO_IMAGE_STRUCT(name) \
- update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y)
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, name##_step_z)
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+ name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
+ name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, 0)
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w, \
+ name##_step_w, mod_size)
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
- update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
+ update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+ name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
- tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
+ tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+ name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
/** Structure to hold Vector information */
typedef struct Vector
@@ -970,10 +958,10 @@ typedef struct Tensor4D
*
* @return An image object
*/
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector
+update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
- Vector vector =
- {
+ Vector vector = {
.ptr = ptr,
.offset_first_element_in_bytes = offset_first_element_in_bytes,
.stride_x = stride_x,
@@ -993,15 +981,13 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_
*
* @return An image object
*/
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(
+ __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
return img;
}
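For context, a hedged sketch of how the *_DECLARATION and CONVERT_TO_*_STRUCT helpers are typically combined; the kernel name and tensor names are placeholders, not part of this patch.
// Each work-item's pointer is already advanced by update_image_workitem_ptr,
// so the body can dereference img.ptr directly.
__kernel void copy_byte(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
{
    Image src = CONVERT_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
    *dst.ptr = *src.ptr;
}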
@@ -1019,16 +1005,21 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
*
* @return A 3D tensor object
*/
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Image img =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y
- };
- img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return img;
}
@@ -1045,17 +1036,22 @@ inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint o
*
* @return A 3D tensor object
*/
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ get_global_id(2) * step_z;
return tensor;
}
@@ -1072,34 +1068,44 @@ inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_fi
*
* @return A 3D tensor object
*/
-inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z)
{
- Tensor3D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z
- };
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
return tensor;
}
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
- uint step_w,
- uint mod_size)
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x,
+ uint step_x,
+ uint stride_y,
+ uint step_y,
+ uint stride_z,
+ uint step_z,
+ uint stride_w,
+ uint step_w,
+ uint mod_size)
{
- Tensor4D tensor =
- {
- .ptr = ptr,
- .offset_first_element_in_bytes = offset_first_element_in_bytes,
- .stride_x = stride_x,
- .stride_y = stride_y,
- .stride_z = stride_z,
- .stride_w = stride_w
- };
-
- tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+ (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
return tensor;
}
@@ -1171,7 +1177,8 @@ inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint wid
const uint x = index;
- return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ tensor->offset_first_element_in_bytes;
}
#endif // _HELPER_H
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 562c5d3236..166260a3c0 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -34,7 +34,7 @@
* @return The converted vector
*/
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
/** Quantize a floating-point scalar value to 8-bit asymmetric
*
@@ -84,14 +84,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return quantized values
*/
-#define QUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
- { \
- VEC_DATA_TYPE(float, size) \
- out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
- VEC_DATA_TYPE(type, size) \
- res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
- return res; \
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+ return res; \
}
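A hedged worked example of the arithmetic above, relying on the QUANTIZE_IMPL(uchar, 1) instantiation further down in this header.
// quantize_uchar1(3.0f, 10.0f, 0.5f): 3.0f / 0.5f + 10.0f = 16.0f,
// rounded to nearest even and saturated to uchar -> 16
uchar q = quantize_uchar1(3.0f, 10.0f, 0.5f);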
/** Dequantize a vector of values to floating-point
@@ -101,10 +102,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return dequantized values in floating point
*/
-#define DEQUANTIZE_IMPL(type, size) \
- inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
- { \
- return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
}
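And the inverse, assuming a matching DEQUANTIZE_IMPL(uchar, 1) instantiation is present as for QUANTIZE_IMPL above; together with the previous example it round-trips the value.
// dequantize_uchar1(16, 10.0f, 0.5f): (16 - 10.0f) * 0.5f = 3.0f
float x = dequantize_uchar1((uchar)16, 10.0f, 0.5f);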
/** Correctly-rounded-to-nearest division by a power-of-two.
@@ -113,18 +115,17 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
- { \
- const VEC_DATA_TYPE(int, size) \
- zero = (VEC_DATA_TYPE(int, size))0; \
- const VEC_DATA_TYPE(int, size) \
- one = (VEC_DATA_TYPE(int, size))1; \
- VEC_DATA_TYPE(int, size) \
- mask = (one << exponent) - one; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
- return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
+ return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
}
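A short worked example of the rounding rule, assuming the size-1 instantiation generated elsewhere in this header.
// asymm_rounding_divide_by_POW2_1(5, 1): mask = 1, threshold = 0,
// (5 >> 1) = 2 and (5 & 1) > threshold, so the result is 2 + 1 = 3, i.e. 5/2 rounded to nearest
int r = asymm_rounding_divide_by_POW2_1(5, 1);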
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -167,27 +168,29 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
- { \
- const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
- const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
- const int k_fractional_bits = 31; \
- VEC_DATA_TYPE(int, size) \
- x = a + (1 << (k_fractional_bits - 3)); \
- VEC_DATA_TYPE(int, size) \
- x2 = ASYMM_MULT(x, x, size); \
- VEC_DATA_TYPE(int, size) \
- x3 = ASYMM_MULT(x2, x, size); \
- VEC_DATA_TYPE(int, size) \
- x4 = ASYMM_MULT(x2, x2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
- VEC_DATA_TYPE(int, size) \
- x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
- return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
}
/** Each bit of the result is set to the corresponding bit of either then_val or
@@ -198,10 +201,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
 * @returns Result containing bits from @p then_val or from @p else_val depending on whether the corresponding bit in @p if_mask is set.
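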
*/
-#define ASYMM_SELECT_USING_MASK_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
- { \
- return (if_mask & then_val) ^ (~if_mask & else_val); \
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size( \
+ VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
}
/** For each element of input vector, the corresponding bits of the result item are set
@@ -234,18 +238,19 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
}
-#define EXP_BARREL_SHIFTER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
- { \
- if(k_integer_bits > exponent) \
- { \
- const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
- return ASYMM_SELECT_USING_MASK( \
- ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
- ASYMM_MULT(result, fp_multiplier, size), result, size); \
- } \
- \
- return result; \
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
}
/** Calculates \f$ exp(x) \f$ for x < 0.
@@ -254,39 +259,40 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Result in fixed-point format Q0.
*/
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
- { \
- const int k_fractional_bits = 31 - k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- k_one_quarter = 1 << (k_fractional_bits - 2); \
- VEC_DATA_TYPE(int, size) \
- mask = k_one_quarter - 1; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
- VEC_DATA_TYPE(int, size) \
- a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
- VEC_DATA_TYPE(int, size) \
- result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
- VEC_DATA_TYPE(int, size) \
- remainder = a_mod_quarter_minus_one_quarter - a; \
- \
- result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
- result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
- \
- if(k_integer_bits > 5) \
- { \
- const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
- result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
- return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \
+ size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
}
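A hedged usage sketch (the int4 instantiation and the input variable are assumptions, not shown in this hunk): the input is fixed point with k_integer_bits integer bits and must be non-positive, and the result is in Q0 fixed-point format as the doc above states.
int4 exp_q0 = ASYMM_EXP_ON_NEGATIVE_VALUES(x_fixed_point, 5, 4); // exp(x) for x <= 0 in Q5.26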
/** Calculates the product of an integer value by a power of two, with either a positive exponent
@@ -297,26 +303,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Arithmetic left or right shift.
*/
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- if(exponent < 0) \
- { \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
- } \
- \
- const VEC_DATA_TYPE(int, size) min = INT_MIN; \
- const VEC_DATA_TYPE(int, size) max = INT_MAX; \
- int threshold = ((1 << (31 - exponent)) - 1); \
- VEC_DATA_TYPE(int, size) \
- positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
- VEC_DATA_TYPE(int, size) \
- negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
- VEC_DATA_TYPE(int, size) \
- result = x << exponent; \
- result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
- result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
- return result; \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
}
/** Calculates (a+b)/2, rounded to the nearest integer.
@@ -326,20 +333,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return (a+b)/2, rounded to the nearest integer.
*/
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
- { \
- VEC_DATA_TYPE(long, size) \
- a64 = convert_long##size(a); \
- VEC_DATA_TYPE(long, size) \
- b64 = convert_long##size(b); \
- VEC_DATA_TYPE(long, size) \
- sum = a64 + b64; \
- const VEC_DATA_TYPE(long, size) one = 1; \
- const VEC_DATA_TYPE(long, size) minus_one = -1; \
- VEC_DATA_TYPE(long, size) \
- sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
- return convert_int##size((sum + sign) / 2); \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
+ return convert_int##size((sum + sign) / 2); \
}
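A worked example of the rounding behaviour, again assuming the size-1 instantiation.
// asymm_rounding_half_sum1(3, 4): sum = 7, sign = +1, (7 + 1) / 2 = 4
// asymm_rounding_half_sum1(-3, -4): sum = -7, sign = -1, (-7 - 1) / 2 = -4
int h = asymm_rounding_half_sum1(3, 4);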
/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
@@ -354,12 +362,12 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
VEC_DATA_TYPE(int, size) \
- half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
VEC_DATA_TYPE(int, size) \
x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
- for(int i = 0; i < 3; i++) \
+ for (int i = 0; i < 3; i++) \
{ \
VEC_DATA_TYPE(int, size) \
half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
@@ -378,48 +386,57 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale)
*
* @return Rescaled value.
*/
-#define ASYMM_RESCALE_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
- { \
- int exponent = src_integer_bits - dst_integer_bits; \
- return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+#define ASYMM_RESCALE_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+ { \
+ int exponent = src_integer_bits - dst_integer_bits; \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
-#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
-#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
-#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
-#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
+#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+ asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \
+ exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
-
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
- { \
- const int left_shift = shift > 0 ? shift : 0; \
- const int right_shift = shift > 0 ? 0 : -shift; \
- return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
+#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \
+ asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+ ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
+
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
}
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
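For reference, the ASYMM_* helpers re-wrapped above follow the gemmlowp-style fixed-point arithmetic. A minimal scalar sketch of the requantization step that MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL expands to could look as follows (illustrative C, not part of this patch; the helper names are invented, and it assumes the usual convention that a positive shift is applied before the doubling high multiply and a negative one as a rounding right shift afterwards):

    #include <stdint.h>

    static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t)((1u << exponent) - 1u);
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0); /* nearest, halves away from zero */
    }

    static int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX; /* the only case that overflows the doubling */
        const int64_t ab    = (int64_t)a * (int64_t)b;
        const int32_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
        return (int32_t)((ab + nudge) >> 31); /* high 32 bits of 2*a*b, rounded */
    }

    static int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int shift)
    {
        const int left_shift  = shift > 0 ? shift : 0;
        const int right_shift = shift > 0 ? 0 : -shift;
        return rounding_divide_by_pow2(saturating_doubling_high_mul(input * (1 << left_shift), qmul), right_shift);
    }

The OpenCL macros above are the vectorized form of the same three steps, with the shift split into its left and right parts exactly as in the sketch.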
diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h
index 4ba2b2ca3a..4daf0adc89 100644
--- a/src/core/CL/cl_kernels/load_store_utility.h
+++ b/src/core/CL/cl_kernels/load_store_utility.h
@@ -223,8 +223,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group STORE_BLOCK
/** Convert and store a block of the given size M0xN0
@@ -245,8 +247,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group CONVERT_STORE_BLOCK
/** Partially store the 0 to (n-1)th rows of the given variables
@@ -365,8 +369,10 @@
* @param[in] Z The offset in z-axis direction
* @{
*/
-#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** Store a block that can be partial in both x and y dimensions
*
* @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
@@ -388,22 +394,23 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
- { \
- STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
- { \
- STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
- } \
- else \
- { \
- STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+ PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+ if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+ { \
+ STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+ } \
+ else \
+ { \
+ STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
}
/** Store a block that can only be partial in x but not y.
*
@@ -425,7 +432,7 @@
* @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
*/
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
- if(!(PARTIAL_COND_X)) \
+ if (!(PARTIAL_COND_X)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -453,7 +460,7 @@
* @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
*/
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
- if(!(PARTIAL_COND_Y)) \
+ if (!(PARTIAL_COND_Y)) \
{ \
STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
} \
@@ -517,23 +524,28 @@
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
// Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
// Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
// Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
- STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X) \
+ STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+ PARTIAL_COND_Y, PARTIAL_COND_X)
#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
@@ -560,8 +572,7 @@
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else // defined(PARTIAL_STORE_M0)
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
- ((uint)(y * M0))
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)
/** @} */ // end of group COMPUTE_M0_START_ROW
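To illustrate the boundary-aware store above: COMPUTE_M0_START_ROW shifts every block's start row up by (M0 - PARTIAL_STORE_M0) % M0 so that the leftover rows land in the first block, and the partial-store path only has to fire for that block. A small standalone sketch (illustrative C, not part of this patch):

    #include <stdio.h>

    /* Mirrors max(0, y*M0 - (M0 - PARTIAL_STORE_M0) % M0) from the macro above. */
    static unsigned compute_m0_start_row(unsigned y, unsigned m0, unsigned partial_store_m0)
    {
        const int row = (int)(y * m0) - (int)((m0 - partial_store_m0) % m0);
        return (unsigned)(row > 0 ? row : 0);
    }

    int main(void)
    {
        /* 10 rows processed in blocks of M0 = 4: PARTIAL_STORE_M0 = 10 % 4 = 2. */
        for (unsigned y = 0; y < 3; ++y)
            printf("block y=%u starts at row %u\n", y, compute_m0_start_row(y, 4, 2));
        return 0; /* prints start rows 0, 2 and 6 */
    }

The first block then stores only PARTIAL_STORE_M0 rows while every later block stores a full M0-row slice without running past the tensor.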
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index bed94a7b3b..cb2f4b0319 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -75,7 +75,9 @@
P_X##_DEF(F, P_A, P_B, P_C); \
REPEAT_3_15(P_X, P_A, P_B, P_C)
-#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, \
+ P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
// Repeat macros with 4 param, excluding the implicit ID param
@@ -126,52 +128,59 @@
P_X##_DEF(F, P_A, P_B, P_C, P_D); \
REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
-#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, \
+ P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
// Macro for initializing N variables. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
// Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
-#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
// Macro for adding a constant to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...)
-#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
// Macro for adding a vector to N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
-#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
// Macro for adding a two N-variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
-#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
// Macro for performing Max between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing Min between a constant and N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements that defines VAR##N =RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
@@ -182,6 +191,7 @@
VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
})
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
#endif // ARM_COMPUTE_REPEAT_H
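The REPEAT_DEF_*_N indirection that these hunks re-wrap exists purely for the preprocessor: the extra macro level forces P_NUM to be expanded before token pasting. A self-contained sketch of the idiom (illustrative C, not part of this patch, with only the 4-way repeat spelled out):

    #include <stdio.h>

    #define N0 4

    #define REPEAT_3_4(P_OP) P_OP(0); P_OP(1); P_OP(2); P_OP(3)

    /* The extra DEF level lets P_NUM expand (N0 -> 4) before ## pastes it;
     * pasting directly inside REPEAT_3_N would form the undefined name REPEAT_3_N0. */
    #define REPEAT_DEF_3_N(P_NUM, P_OP) REPEAT_3_##P_NUM(P_OP)
    #define REPEAT_3_N(P_NUM, P_OP)     REPEAT_DEF_3_N(P_NUM, P_OP)

    #define PRINT_ID(ID) printf("iteration %d\n", ID)

    int main(void)
    {
        REPEAT_3_N(N0, PRINT_ID); /* expands to four printf statements */
        return 0;
    }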
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 642483ab3c..6595bd1981 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -31,11 +31,13 @@
* @param[in] border_size Border size of the image
*
*/
-inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+inline const float8
+clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
{
const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3,
+ clamped_y.s3);
}
/** Clamps the given coordinates to the borders.
@@ -74,7 +76,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int
*/
inline const float8 get_neighbour_coords(const float2 coord)
{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1,
+ /*br*/ coord.s0 + 1, coord.s1 + 1);
}
/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
@@ -85,37 +88,38 @@ inline const float8 get_neighbour_coords(const float2 coord)
* @param[in] height Height of the image
* @param[in] border_size Border size
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(
+ const Image *in, const float8 coords, const float width, const float height, const float border_size)
{
// If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
// Sets the 4x4 coordinates for each of the four input texels
const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+ const float16 c1 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+ const float16 c2 =
+ (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+ clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
// Loads the values from the input image
const float16 t = (float16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
- ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
- ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
- ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ /* tl, tr, bl, br */
+ *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr =
+ (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
}
@@ -126,7 +130,8 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const
* @param[in] width Width of the image
* @param[in] height Height of the image
*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4)
+ bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
{
return bilinear_interpolate_with_border(in, coords, width, height, 1);
}
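The weighting used above in bilinear_interpolate_with_border is the standard bilinear blend of the four clamped neighbour texels, with a = coords - floor(coords) and b = 1 - a. In scalar form (illustrative C, not part of this patch):

    #include <math.h>

    /* tl, tr, bl, br are the four clamped neighbour texels of (x, y). */
    static float bilinear_sample(float tl, float tr, float bl, float br, float x, float y)
    {
        const float ax = x - floorf(x); /* the 'a' fractions in the kernel */
        const float ay = y - floorf(y);
        const float bx = 1.0f - ax;     /* the 'b' = 1 - a weights */
        const float by = 1.0f - ay;
        return tl * bx * by + tr * ax * by + bl * bx * ay + br * ax * ay;
    }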
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 2728958add..5b72354abe 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,16 +45,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+ DataType::U64);
}
return Status{};
@@ -66,22 +71,34 @@ CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape{ input->info()->tensor_shape() };
+ TensorShape output_shape{input->info()->tensor_shape()};
output_shape.set(axis, 1);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(DataType::S32)
+ .reset_padding()
+ .set_is_resizable(true));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -90,11 +107,14 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
// Set build options
const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
- const auto vector_size = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
+ const auto vector_size = (adjusted_vector_size == 3U && axis == 0U)
+ ? 2U
+ : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % vector_size));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
@@ -104,7 +124,7 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
// Create kernel
std::string kernel_axis_name;
- switch(axis)
+ switch (axis)
{
case 0:
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -135,7 +155,10 @@ void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
@@ -146,7 +169,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
@@ -154,7 +177,8 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
Window out_window(window);
Window in_window(window);
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
- in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ in_window.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
// Get first input and output slices
@@ -166,15 +190,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+ } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
break;
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -184,15 +208,15 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -202,14 +226,13 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -220,8 +243,7 @@ void CLArgMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 5f36bdf113..fb3b41b0de 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -72,7 +73,11 @@ public:
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
*
@@ -84,7 +89,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
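For context, the kernel declared above reduces one axis to the index of its minimum or maximum element; the output keeps the input shape with that axis collapsed to 1. A scalar sketch of ARG_IDX_MAX along axis 0 (the innermost, X, dimension) of a 2D input (illustrative C, not part of this patch):

    #include <stddef.h>

    /* in: width x height, row-major; out: one index per row (axis 0 collapsed to 1). */
    static void arg_idx_max_axis0(const float *in, int *out, size_t width, size_t height)
    {
        for (size_t y = 0; y < height; ++y)
        {
            size_t best = 0;
            for (size_t x = 1; x < width; ++x)
            {
                if (in[y * width + x] > in[y * width + best])
                {
                    best = x;
                }
            }
            out[y] = (int)best;
        }
    }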
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 3fa8a8edaa..c88a852a44 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -23,58 +23,64 @@
*/
#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
- if(beta != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
}
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -86,14 +92,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
{
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->element_size(), input->dimension(0));
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
bool window_changed = false;
- if(output != nullptr)
+ if (output != nullptr)
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
window_changed = update_window_and_padding(win, input_access, output_access);
@@ -104,30 +111,50 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
window_changed = update_window_and_padding(win, input_access);
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+ : _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _beta(nullptr),
+ _gamma(nullptr),
+ _epsilon(0),
+ _run_in_place(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
- auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma });
+ auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
_input = input;
_output = output;
_mean = mean;
@@ -142,13 +169,15 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
(gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -157,29 +186,33 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
// Create kernel
- _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel =
+ create_kernel(compile_context,
+ "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Set kernel static arguments
unsigned int include_output = (!_run_in_place) ? 1 : 0;
- unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
- if(_beta != nullptr)
+ unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor() +
+ 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+ if (_beta != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip beta parameter
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
}
_kernel.setArg<cl_float>(idx++, _epsilon);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
// Configure kernel window
- if(input->info()->data_layout() == DataLayout::NHWC)
+ if (input->info()->data_layout() == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -205,18 +238,23 @@ void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_
_config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
}
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- if(input->data_layout() != DataLayout::NHWC)
+ if (input->data_layout() != DataLayout::NHWC)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+ .first);
}
return Status{};
@@ -236,11 +274,11 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
unsigned int idx = (1 + include_output) * num_arguments_per_3D_tensor();
add_1D_tensor_argument(idx, _mean, vector_slice);
add_1D_tensor_argument(idx, _var, vector_slice);
- if(_beta != nullptr)
+ if (_beta != nullptr)
{
add_1D_tensor_argument(idx, _beta, vector_slice);
}
- if(_gamma != nullptr)
+ if (_gamma != nullptr)
{
add_1D_tensor_argument(idx, _gamma, vector_slice);
}
@@ -249,11 +287,10 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
{
idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
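For context, the per-element computation carried by this kernel is the usual batch normalization, with beta and gamma defaulting to 0 and 1 when the optional tensors are absent (see the USE_DEFAULT_BETA/USE_DEFAULT_GAMMA build options above) and an optional fused RELU-family activation applied afterwards. A scalar sketch (illustrative C, not part of this patch):

    #include <math.h>

    static float batch_normalize(float x, float mean, float var, float beta, float gamma, float epsilon)
    {
        const float x_hat = (x - mean) / sqrtf(var + epsilon); /* normalise using the per-channel stats */
        return gamma * x_hat + beta;                           /* scale and shift */
    }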
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index acbe0f2a26..1a88d2a8c5 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -64,7 +65,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -82,8 +89,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -99,10 +113,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index 143a842d02..c640b5a8d6 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -25,13 +25,14 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
-#include "arm_compute/core/TensorInfo.h"
using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
@@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -66,10 +71,11 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
- const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -79,8 +85,7 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
}
} // namespace
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -90,11 +95,14 @@ void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTenso
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, block_shape, output });
+ auto padding_info = get_padding_info({input, block_shape, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
@@ -106,8 +114,9 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
- _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
-
+ _kernel = create_kernel(compile_context,
+ "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -116,47 +125,65 @@ void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output,
- const CropInfo &crop_info)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
- _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -185,7 +212,7 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, slice_in);
add_argument(idx, batch_id);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
add_1D_tensor_argument(idx, _block_shape, vector_slice);
}
@@ -193,7 +220,6 @@ void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index a05184cd5b..b9d3e66fe2 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -65,7 +66,10 @@ public:
*
* @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -74,7 +78,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info);
+ void configure(const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Initialise the kernel's inputs and output (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -84,7 +92,12 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -106,7 +119,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info);
+ static Status validate(const ITensorInfo *input,
+ const int32_t block_shape_x,
+ const int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
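
A minimal validate-only sketch of the static block-shape path declared above; the tensor shapes, the NHWC layout and the default-constructed CropInfo are illustrative assumptions rather than values taken from this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"

using namespace arm_compute;

// NHWC shapes are TensorShape(C, W, H, N): a batch of 4 folded back into an 8x8 plane by a 2x2 block.
bool check_batch_to_space_static()
{
    TensorInfo src(TensorShape(8U, 4U, 4U, 4U), 1, DataType::F32);
    TensorInfo dst(TensorShape(8U, 8U, 8U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    // Check the configuration before any OpenCL resources are created; no cropping is requested.
    const Status status = CLBatchToSpaceLayerKernel::validate(&src, 2, 2, &dst, CropInfo{});
    return status.error_code() == ErrorCode::OK; // expected to hold for these shapes
}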
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index 11e6d021a5..de3fb43de8 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -28,25 +28,29 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLBitwiseKernel::CLBitwiseKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- if(op != BitwiseOperation::NOT)
+ if (op != BitwiseOperation::NOT)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
@@ -56,7 +60,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
 // Output auto initialization if not yet initialized
auto_init_if_empty(*(output->info()), *(input1->info()));
- auto padding_info = get_padding_info({ input1, input2, output });
+ auto padding_info = get_padding_info({input1, input2, output});
// Configure kernel window
const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
@@ -68,7 +72,7 @@ void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const I
// Create kernel
std::string kernel_name = "";
- switch(op)
+ switch (op)
{
case BitwiseOperation::AND:
kernel_name = "bitwise_and";
@@ -107,13 +111,12 @@ void CLBitwiseKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_2D_tensor_argument(idx, _input1, slice);
- if(_input2 != nullptr)
+ if (_input2 != nullptr)
{
add_2D_tensor_argument(idx, _input2, slice);
}
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
index c5a999643d..2c74955ae4 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.h
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] op Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ BitwiseOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
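
A hedged configuration sketch for the overload above; it assumes the CL backend has already been initialised (for example via CLScheduler::get().default_init()) and uses illustrative U8 shapes.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLBitwiseKernel.h"

using namespace arm_compute;

// Elementwise AND of two U8 tensors; for BitwiseOperation::NOT, input2 may be nullptr.
void configure_bitwise_and(CLBitwiseKernel &bitwise, CLTensor &a, CLTensor &b, CLTensor &dst)
{
    a.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
    b.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));

    bitwise.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, &dst, BitwiseOperation::AND);
    // The configured kernel would normally be enqueued afterwards through the CL scheduler.
}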
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 72de854afb..f32c518e29 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -40,7 +41,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -53,7 +57,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -65,12 +69,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(is_qasymm16)
+ if (is_qasymm16)
{
const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -83,22 +87,31 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
}
} // namespace
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
- : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
- auto padding_info = get_padding_info({ boxes, pred_boxes, deltas });
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
@@ -128,7 +141,7 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
- if(is_quantized)
+ if (is_quantized)
{
build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
const UniformQuantizationInfo boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -148,12 +161,15 @@ void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_con
// Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
const unsigned int num_elems_processed_per_iteration = 4;
- Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
index 08f350e86a..9a1bb49bb9 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -58,7 +58,10 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -71,7 +74,11 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -85,7 +92,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
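
A validation-only sketch for the transform declared above; the box/delta shapes are illustrative, and the BoundingBoxTransformInfo constructor order (image width, image height, scale) is an assumption not shown in this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"

using namespace arm_compute;

// 128 proposal boxes (4 coordinates each) and per-class deltas for 2 classes (4 * 2 = 8 values per box).
bool check_bounding_box_transform()
{
    TensorInfo boxes(TensorShape(4U, 128U), 1, DataType::F32);
    TensorInfo deltas(TensorShape(8U, 128U), 1, DataType::F32);
    TensorInfo pred_boxes(TensorShape(8U, 128U), 1, DataType::F32);

    const BoundingBoxTransformInfo info(640.f, 480.f, 1.f); // single image, no post-scale
    return CLBoundingBoxTransformKernel::validate(&boxes, &pred_boxes, &deltas, info).error_code() == ErrorCode::OK;
}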
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index a2a0bc4fb4..ec58bf9e7a 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,15 +47,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
// There cannot be more groups than channels
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -70,11 +75,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
auto_init_if_empty(*output, *input->clone());
const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
- if(is_nhwc)
+ if (is_nhwc)
{
- unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
- Window win_collapsed = win.collapse(win, Window::DimZ);
+ unsigned int num_elems_processed_per_iteration_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+ Window win_collapsed = win.collapse(win, Window::DimZ);
return std::make_pair(Status{}, win_collapsed);
}
else
@@ -83,22 +89,25 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
constexpr unsigned int num_elems_processed_per_iteration_y = 2;
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ Window win = calculate_max_window(
+ *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
const bool window_changed = update_window_and_padding(win, input_access, output_access);
Window win_collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win_collapsed);
}
}
} // namespace
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -108,23 +117,27 @@ void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *o
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- const bool is_nhwc = data_layout == DataLayout::NHWC;
- const unsigned int channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- unsigned int vec_size_x = 0;
- unsigned int vec_size_x_leftovers = 0;
- if(is_nhwc)
+ const DataLayout data_layout = input->info()->data_layout();
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const unsigned int channels =
+ input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ unsigned int vec_size_x = 0;
+ unsigned int vec_size_x_leftovers = 0;
+ if (is_nhwc)
{
- vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
}
else
@@ -170,13 +183,14 @@ void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_cont
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(2));
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
}
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
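
For reference, a minimal validate sketch of the channel-shuffle checks handled above; the shapes and group count are illustrative assumptions.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"

using namespace arm_compute;

// Default NCHW tensor with 8 channels shuffled in 4 groups: 8 % 4 == 0 and 2 <= 4 < 8, so the checks should pass.
bool check_channel_shuffle()
{
    TensorInfo src(TensorShape(16U, 16U, 8U, 1U), 1, DataType::F32);
    TensorInfo dst(TensorShape(16U, 16U, 8U, 1U), 1, DataType::F32);
    return CLChannelShuffleLayerKernel::validate(&src, &dst, 4).error_code() == ErrorCode::OK;
}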
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 31c007f17e..43c939ebd8 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -60,7 +60,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index f4d6316517..f27270733e 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,14 +39,10 @@ namespace arm_compute
namespace
{
// Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
- { ComparisonOperation::Equal, "EQUAL" },
- { ComparisonOperation::NotEqual, "NOTEQUAL" },
- { ComparisonOperation::Greater, "GREATER" },
- { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
- { ComparisonOperation::Less, "LESS" },
- { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+ {ComparisonOperation::Equal, "EQUAL"}, {ComparisonOperation::NotEqual, "NOTEQUAL"},
+ {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+ {ComparisonOperation::Less, "LESS"}, {ComparisonOperation::LessEqual, "LESSEQUAL"},
};
int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
@@ -53,7 +50,10 @@ int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
return 16 / input.element_size();
}
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured output
- if(output.total_size() > 0)
+ if (output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
{
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+ const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
// Auto initialize output if not initialized
@@ -90,27 +90,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe
AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win_input1, input1_access)
- || update_window_and_padding(win_input2, input2_access)
- || update_window_and_padding(win, output_access);
+ bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+ update_window_and_padding(win_input2, input2_access) ||
+ update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
-CLComparisonKernel::CLComparisonKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -129,10 +136,11 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
// Set kernel build options
std::set<std::string> build_opts;
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
- build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+ build_opts.emplace("-DVEC_SIZE=" +
+ support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
build_opts.emplace("-DOP=" + operation_name);
build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
- if(is_data_type_quantized(input1->info()->data_type()))
+ if (is_data_type_quantized(input1->info()->data_type()))
{
const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
@@ -160,12 +168,16 @@ void CLComparisonKernel::configure(const CLCompileContext &compile_context, cons
_config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
}
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
return Status{};
}
@@ -181,17 +193,18 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
bool can_collapse = true;
const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -212,16 +225,16 @@ void CLComparisonKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
BorderSize CLComparisonKernel::border_size() const
{
const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
- const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
- const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize{ 0, border, 0, 0 };
+ const unsigned int replicateSize =
+ _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+ const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+ return BorderSize{0, border, 0, 0};
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
index 0b94190183..174a6c9bf9 100644
--- a/src/core/CL/kernels/CLComparisonKernel.h
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -64,7 +65,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
* @param[in] operation Comparison operation to use.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -74,10 +79,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
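
A validate-only sketch matching the overloads documented above; the data types and shapes are illustrative, and the output mask must be U8.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/CL/kernels/CLComparisonKernel.h"

using namespace arm_compute;

// Elementwise "greater than" between two F32 tensors, producing a U8 mask of the same shape.
bool check_comparison()
{
    TensorInfo lhs(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo rhs(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo mask(TensorShape(32U, 16U), 1, DataType::U8);
    return CLComparisonKernel::validate(&lhs, &rhs, &mask, ComparisonOperation::Greater).error_code() == ErrorCode::OK;
}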
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 76af5d564a..f8ecc4c098 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -40,7 +41,8 @@ CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
_type = CLKernelType::ELEMENTWISE;
}
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
const PadStrideInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -60,7 +62,7 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
}
@@ -68,20 +70,21 @@ Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, co
return Status{};
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -119,7 +122,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -137,8 +140,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+ } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
break;
}
case DataLayout::NHWC:
@@ -156,8 +158,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
add_3D_tensor_argument(idx, _input, slice_in);
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
break;
}
default:
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index e0d1322341..762989a836 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -62,7 +62,10 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 0fc0ff8168..b33e0a8b6f 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -38,7 +39,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -53,19 +58,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
- if(!is_qasymm)
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S32);
+ if (!is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+ weights_info->dimension(idx_b));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_qasymm)
+ if (is_qasymm)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -76,19 +83,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+ stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -97,11 +111,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+ auto out_dims =
+ deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+ weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
- const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+ auto_init_if_empty(*output, input->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_layout(data_layout)
+ .set_quantization_info(input->quantization_info()));
Window win = calculate_max_window(*input);
@@ -109,29 +129,37 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
- : _add_bias(false),
- _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
}
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
- const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+ output->info(), input_info, weights_info, deconv_info));
- auto padding_info = get_padding_info({ input, bias, output });
+ auto padding_info = get_padding_info({input, bias, output});
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+ auto win_config =
+ validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
const DataLayout data_layout = input_info->data_layout();
@@ -178,7 +206,11 @@ void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -194,7 +226,7 @@ void CLDeconvolutionReshapeOutputKernel::run(const Window &window, cl::CommandQu
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, collapsed);
add_3D_tensor_argument(idx, _output, collapsed);
- if(_add_bias)
+ if (_add_bias)
{
add_1D_tensor_argument(idx, _bias, collapsed);
}
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index ce354fa86f..8f436b07e3 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -67,7 +67,12 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Initialise the kernel's source and destination.
*
* @param[in] compile_context The compile context to be used.
@@ -79,8 +84,13 @@ public:
* @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel.
*
@@ -93,7 +103,12 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 5c1dc4fbf6..cdf19ab2e1 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,12 +50,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -63,8 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -74,14 +76,18 @@ void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
@@ -98,7 +104,9 @@ void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -137,7 +145,6 @@ void CLDepthToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_in, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_in));
+ } while (window.slide_window_slice_3D(slice_in));
}
} // namespace arm_compute
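
A configuration sketch for the kernel above; it assumes a CL context has already been initialised (e.g. CLScheduler::get().default_init()), and the NCHW shapes and block size are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"

using namespace arm_compute;

// NCHW shapes are TensorShape(W, H, C, N): 8 channels with block_shape 2 become 2 channels on a 2x larger plane.
void configure_depth_to_space(CLDepthToSpaceLayerKernel &kernel, CLTensor &src, CLTensor &dst)
{
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U, 1U), 1, DataType::F32));

    // Equivalent to the compile-context overload using the default kernel-library context.
    kernel.configure(&src, &dst, 2);
}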
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 1f7f77b569..cef70c4dda 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index e34b6929e7..b95abe795f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -23,16 +23,17 @@
*/
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/ICLKernel.h"
@@ -45,12 +46,18 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_UNUSED(dwc_info);
bool in_place = false;
- if(output == nullptr || output == input)
+ if (output == nullptr || output == input)
{
in_place = true;
output = input;
@@ -58,11 +65,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+ (export_to_cl_image(weights) == false),
+ "Weights cannot be exported to cl_image!");
ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -72,33 +82,40 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
// In place restrictions
- if(in_place)
+ if (in_place)
{
- const int weights_width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U);
+ const int weights_width_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const int weights_height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+ weights->tensor_shape()[weights_height_idx] != 1U);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
-        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in principle padding can be supported with in_place but we choose not to support it
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            conv_info.pad_stride_info
+                .has_padding()); // Note that in principle padding can be supported with in_place but we choose not to support it
}
- const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+ const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+ conv_info.dilation};
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
- if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+ if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
}
const bool is_quantized = is_data_type_quantized(input->data_type());
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -108,7 +125,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -116,7 +133,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -134,22 +151,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- if(is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized(input->data_type()))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+ const UniformQuantizationInfo oq_info =
+ (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
}
return Status{};
@@ -171,30 +190,48 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
_type = CLKernelType::DEPTHWISE;
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+ output_multipliers, output_shifts);
}
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info,
- const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers,
+ const ICLTensor *output_shifts)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- if(output == nullptr)
+ if (output == nullptr)
{
// In-place
output = input;
}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+ conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+ (output_shifts != nullptr) ? output_shifts->info() : nullptr));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info);
- auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+ *(input->info()), *(weights->info()), conv_info);
+ auto_init_if_empty(*(output->info()), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_quantization_info(output->info()->quantization_info()));
_input = input;
_output = output;
@@ -214,12 +251,12 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
CLBuildOptions build_opts;
// Update the padding for the input/weights tensor if we can export to cl_image
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
}
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
}
@@ -229,9 +266,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
const auto act_function = conv_info.act_info.activation();
const auto dst_data_type = _output->info()->data_type();
- if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
{
// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -268,23 +306,24 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
- build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0));
+ build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+ "-DN0_A=" + support::cpp11::to_string(n0));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
// Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
- set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
- static_cast<int>(_weights->info()->dimension(1)),
- static_cast<int>(_weights->info()->dimension(2))
- });
+ set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+ static_cast<int>(_weights->info()->dimension(1)),
+ static_cast<int>(_weights->info()->dimension(2))});
- if(biases != nullptr)
+ if (biases != nullptr)
{
build_opts.add_option(std::string("-DHAS_BIAS"));
- build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+ build_opts.add_option(
+ std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
}
- if(_is_quantized)
+ if (_is_quantized)
{
kernel_name = "dwc_native_quantized_nhwc";
const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
@@ -306,13 +345,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
- build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
- build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type()));
- build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+ build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+ build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+ get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+ build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+ "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
// Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
int a_val{};
int b_val{};
- std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+ std::tie(b_val, a_val) =
+ get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
@@ -321,8 +364,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
{
kernel_name = "dwc_native_fp_nhwc";
build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
- build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+ build_opts.add_option_if(conv_info.act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
}
Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
@@ -350,10 +395,17 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
_config_id += string_from_data_type(input->info()->data_type());
}
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
return Status{};
}
@@ -370,47 +422,52 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
cl::Image2D input_cl_image;
cl::Image2D weights_cl_image;
- if(_export_input_to_cl_image || _export_weights_to_cl_image)
+ if (_export_input_to_cl_image || _export_weights_to_cl_image)
{
// Export cl_buffer to cl_image
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
- const size_t image_w = _input->info()->dimension(0) / 4;
- const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+ const size_t image_w = _input->info()->dimension(0) / 4;
+ const size_t image_h =
+ _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = _input->info()->strides_in_bytes()[1];
- input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ input_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+ _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
- const size_t image_w = _weights->info()->dimension(0) / 4;
- const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+ const size_t image_w = _weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1];
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch,
- CLImage2DType::ReadOnly);
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+ _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
}
unsigned int idx = 0;
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
_kernel.setArg(idx++, input_cl_image);
}
add_4d_tensor_nhwc_argument(idx, _input);
add_4d_tensor_nhwc_argument(idx, _output);
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
add_4d_tensor_nhwc_argument(idx, _weights);
- if(_is_quantized)
+ if (_is_quantized)
{
add_1D_tensor_argument(idx, _output_multipliers, slice);
add_1D_tensor_argument(idx, _output_shifts, slice);
}
- if(_biases != nullptr)
+ if (_biases != nullptr)
{
add_1D_tensor_argument(idx, _biases, slice);
}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 8eee7b2500..d34a662966 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
class ICLTensor;
@@ -74,15 +74,28 @@ public:
* * no padding
* * no change of data layout after configure
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
* Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ICLTensor *output_multipliers = nullptr,
+ const ICLTensor *output_shifts = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
*
@@ -90,23 +103,29 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
- const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const DWCComputeKernelInfo &dwc_info,
+ const ConvolutionInfo &conv_info,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input {};
+ const ICLTensor *_input{};
const ICLTensor *_weights{};
const ICLTensor *_biases{};
ICLTensor *_output{};
- unsigned int _depth_multiplier{ 0 };
+ unsigned int _depth_multiplier{0};
const ICLTensor *_output_multipliers{};
const ICLTensor *_output_shifts{};
- bool _export_input_to_cl_image{ false };
- bool _export_weights_to_cl_image{ true };
- bool _is_quantized{ false };
+ bool _export_input_to_cl_image{false};
+ bool _export_weights_to_cl_image{true};
+ bool _is_quantized{false};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 9b514ed705..3d8f875ef7 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,17 +38,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -57,7 +61,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -69,21 +76,27 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
- : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
}
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
- auto padding_info = get_padding_info({ input, output, idx });
+ auto padding_info = get_padding_info({input, output, idx});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
_input = input;
@@ -114,10 +127,14 @@ void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -137,7 +154,6 @@ void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _output, slice);
add_1D_tensor_argument(idx, _idx, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
index e5583a4c22..fdd1bcc3d3 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -56,7 +56,8 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void
+ configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -65,7 +66,11 @@ public:
* @param[in] idx Digit reverse index tensor. Data type supported: U32
* @param[in] config Kernel configuration.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *idx,
+ const FFTDigitReverseKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
@@ -75,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 95f4b640bd..3729e6b77d 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,11 +47,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -59,9 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -76,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -87,11 +88,15 @@ void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
- auto padding_info = get_padding_info({ input, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -110,11 +115,12 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments if not the first stage
- if(!config.is_first_stage)
+ if (!config.is_first_stage)
{
const unsigned int Ni = config.Nx * config.radix;
const float exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_uint>(idx++, config.Nx);
_kernel.setArg<cl_uint>(idx++, Ni);
_kernel.setArg<cl_float>(idx, exp_const);
@@ -136,21 +142,22 @@ void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, I
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -165,12 +172,11 @@ void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
index 9bb310db83..de80bfced3 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
#include <set>
namespace arm_compute
@@ -69,7 +69,10 @@ public:
* @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
* @param[in] config FFT descriptor metadata.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTRadixStageKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index 8a714d71bf..be6e16b074 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -54,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
}
} // namespace
-CLFFTScaleKernel::CLFFTScaleKernel()
- : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -65,11 +65,14 @@ void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTS
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -78,20 +81,22 @@ void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTen
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
- build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+ : input->info()->num_channels()));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(config.conjugate, "-DCONJ");
std::string kernel_name = "fft_scale_conj";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set static arguments
- unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+ unsigned int idx =
+ (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
_kernel.setArg<cl_float>(idx, config.scale);
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
        // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
@@ -130,12 +135,11 @@ void CLFFTScaleKernel::run(const Window &window, cl::CommandQueue &queue)
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, _output, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
index cc518be193..b995282e02 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.h
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -63,7 +63,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] config Kernel configuration
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const FFTScaleKernelInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
*
* @param[in] input Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index fcd99a4ed9..86bb502da3 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -31,14 +31,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
namespace arm_compute
{
-CLFillBorderKernel::CLFillBorderKernel()
- : ICLKernel(), _tensor(nullptr)
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -56,27 +56,38 @@ void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
ICLKernel::add_argument<T>(idx, static_cast<T>(value));
}
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
_tensor = tensor;
configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
}
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
- auto padding_info = get_padding_info({ tensor });
+ auto padding_info = get_padding_info({tensor});
border_size.limit(tensor->padding());
// If there is no border: early exit
- if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
{
return;
}
@@ -98,25 +109,22 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Create static kernel arguments
- const unsigned int valid_width = tensor->valid_region().shape[0];
- const unsigned int valid_height = tensor->valid_region().shape[1];
- const cl_int2 valid_region_coords =
- {
- {
- static_cast<cl_int>(tensor->valid_region().anchor[0]),
- static_cast<cl_int>(tensor->valid_region().anchor[1]),
- }
- };
- const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+ const unsigned int valid_width = tensor->valid_region().shape[0];
+ const unsigned int valid_height = tensor->valid_region().shape[1];
+ const cl_int2 valid_region_coords = {{
+ static_cast<cl_int>(tensor->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->valid_region().anchor[1]),
+ }};
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
- if(BorderMode::CONSTANT == border_mode)
+ if (BorderMode::CONSTANT == border_mode)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -175,12 +183,13 @@ void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITen
void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
- const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto tensor =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
@@ -193,14 +202,13 @@ void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::
unsigned int idx = 0;
add_3D_tensor_argument(idx, tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
{
// Border mode undefined or border width == 0
- if(_kernel() == nullptr)
+ if (_kernel() == nullptr)
{
return;
}
@@ -216,7 +224,6 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
index 7951f48171..5782143cf9 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.h
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -57,7 +58,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
 * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
@@ -65,7 +70,10 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ICLTensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the kernel's input, output and border mode.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +82,11 @@ public:
* @param[in] border_mode Border mode to use for the convolution.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Function to set the constant value on fill border kernel depending on type.
*
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 68fe324df6..7da0679ae4 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -30,20 +30,26 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -54,43 +60,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -101,28 +108,52 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
- auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma });
+ auto padding_info =
+ get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
_input_weights = input_weights;
_input_bias = input_bias;
@@ -135,28 +166,28 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
_epsilon = epsilon;
_run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
- _run_in_place_bias = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+ _run_in_place_bias =
+ (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
// Configure kernel window
Window win = calculate_max_window(*input_weights->info());
@@ -165,7 +196,8 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
- build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+ build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+ "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -180,12 +212,19 @@ void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -202,25 +241,25 @@ void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::
// Add kernel arguments
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input_weights, slice_3d);
- if(_input_bias != nullptr)
+ if (_input_bias != nullptr)
{
add_1D_tensor_argument(idx, _input_bias, slice_1d);
}
add_1D_tensor_argument(idx, _bn_mean, slice_1d);
add_1D_tensor_argument(idx, _bn_var, slice_1d);
- if(!_run_in_place_weights)
+ if (!_run_in_place_weights)
{
add_3D_tensor_argument(idx, _fused_weights, slice_3d);
}
- if(!_run_in_place_bias)
+ if (!_run_in_place_bias)
{
add_1D_tensor_argument(idx, _fused_bias, slice_1d);
}
- if(_bn_beta != nullptr)
+ if (_bn_beta != nullptr)
{
add_1D_tensor_argument(idx, _bn_beta, slice_1d);
}
- if(_bn_gamma != nullptr)
+ if (_bn_gamma != nullptr)
{
add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
}
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 78b1e74cab..76ec7a759f 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -62,9 +62,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the source, destination of the kernel
*
* @param[in] compile_context The compile context to be used.
@@ -81,9 +88,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -101,10 +116,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
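
Aside, for readers skimming this patch: a minimal sketch of how the static validate() reformatted above can be exercised. The header path is the one touched by this diff; the tensor shapes and the F32 data type are illustrative assumptions, and all optional arguments keep their documented defaults.

    // Illustrative sketch only; shapes and data type are assumptions, not part of this patch.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"

    arm_compute::Status check_fuse_bn()
    {
        using namespace arm_compute;
        // Hypothetical convolution weights [kernel_x, kernel_y, IFM, OFM] and per-OFM statistics.
        TensorInfo weights(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32);
        TensorInfo bn_mean(TensorShape(32U), 1, DataType::F32);
        TensorInfo bn_var(bn_mean);
        TensorInfo fused_weights(weights); // destination, same shape as the input weights
        TensorInfo fused_bias(bn_mean);    // destination, same shape as bn_mean
        // input_bias/bn_beta/bn_gamma stay nullptr, epsilon = 0.001f, fbn_type = CONVOLUTION (defaults).
        return CLFuseBatchNormalizationKernel::validate(&weights, &bn_mean, &bn_var, &fused_weights, &fused_bias);
    }
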
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index 5495023b80..c11a18940a 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -34,7 +36,8 @@ namespace arm_compute
{
namespace
{
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
@@ -43,11 +46,12 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -56,12 +60,14 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
// Output auto initialization if not yet initialized
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), actual_axis);
auto_init_if_empty((*output), output_shape, 1, input->data_type());
// Create window
@@ -72,8 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
-CLGatherKernel::CLGatherKernel()
- : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -83,10 +88,14 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
- auto padding_info = get_padding_info({ input, output, indices });
+ auto padding_info = get_padding_info({input, output, indices});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
// Configure kernel window
@@ -100,7 +109,8 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
// Set build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2)));
build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
@@ -114,10 +124,12 @@ void CLGatherKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
return Status{};
}
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
index 8f472a4696..db4b49d2f5 100644
--- a/src/core/CL/kernels/CLGatherKernel.h
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
@@ -74,7 +79,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
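
For context, a hedged sketch of calling the CLGatherKernel::validate() entry point re-wrapped above; the shapes, the U32 index type and the axis value are assumptions for illustration only.

    // Illustrative sketch only; not part of this patch.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "src/core/CL/kernels/CLGatherKernel.h"

    arm_compute::Status check_gather()
    {
        using namespace arm_compute;
        TensorInfo input(TensorShape(8U, 4U), 1, DataType::F32);
        TensorInfo indices(TensorShape(3U), 1, DataType::U32); // three positions gathered along axis 0
        TensorInfo output(TensorShape(3U, 4U), 1, DataType::F32);
        return CLGatherKernel::validate(&input, &indices, &output, 0 /*axis, defaults to 0*/);
    }
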
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 088c454f3c..b9ff72b928 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
size_t feature_height = info.feat_height();
size_t feature_width = info.feat_width();
@@ -57,7 +58,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -66,21 +67,25 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
}
} // namespace
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
- : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
}
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
- auto padding_info = get_padding_info({ anchors, all_anchors });
+ auto padding_info = get_padding_info({anchors, all_anchors});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
// Metadata
@@ -91,7 +96,8 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -108,7 +114,7 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -116,8 +122,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
}
// Create kernel
- const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ const std::string kernel_name =
+ (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
// This means we don't need to pad on the X dimension, as we know in advance how many fields
@@ -127,7 +134,9 @@ void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index d26795ac7d..e08f281d6c 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -62,7 +62,10 @@ public:
* @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *anchors,
+ ICLTensor *all_anchors,
+ const ComputeAnchorsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
*
@@ -81,5 +84,5 @@ private:
const ICLTensor *_anchors;
ICLTensor *_all_anchors;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
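
A small forwarding helper, sketched here only to show how the reformatted CLComputeAllAnchorsKernel::validate() above is meant to be driven; building the ComputeAnchorsInfo descriptor is left to the caller, since its construction does not appear anywhere in this patch.

    // Illustrative forwarder only; the descriptor comes from the caller.
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"

    arm_compute::Status check_all_anchors(const arm_compute::ITensorInfo        &anchors,
                                          const arm_compute::ITensorInfo        &all_anchors,
                                          const arm_compute::ComputeAnchorsInfo &info)
    {
        // Per the checks above: anchors is [values_per_roi, num_anchors] and, once initialised,
        // all_anchors must be [values_per_roi, feat_width * feat_height * num_anchors].
        return arm_compute::CLComputeAllAnchorsKernel::validate(&anchors, &all_anchors, info);
    }
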
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 7ed323c950..b13eb16556 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -39,17 +40,20 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
@@ -59,27 +63,30 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
} // namespace
-CLComputeMeanVariance::CLComputeMeanVariance()
- : _input(nullptr), _output(nullptr)
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision)
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ bool use_mixed_precision)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
@@ -88,7 +95,8 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I
const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
CLBuildOptions build_opts;
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+ (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -108,7 +116,7 @@ void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, I
const TensorShape out_shape(input_channel, 2u, input_batches);
// Output auto initialization if not yet initialized
- if(use_mixed_precision)
+ if (use_mixed_precision)
{
auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
}
@@ -134,7 +142,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -157,10 +165,14 @@ CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output == nullptr ? input : output;
@@ -172,7 +184,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+ ? "float"
+ : get_cl_type_from_data_type(input->info()->data_type())));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -188,7 +202,7 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(1));
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
}
@@ -197,7 +211,9 @@ void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
return Status{};
@@ -211,7 +227,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
Window collapsed_window = window.collapse(window, Window::DimZ);
// We will process the planes together
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if (_input->info()->data_layout() == DataLayout::NCHW)
{
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -226,7 +242,7 @@ void CLInstanceNormalizationLayerKernel::run(const Window &window, cl::CommandQu
add_4D_tensor_argument(idx, _input, collapsed_window);
add_3D_tensor_argument(idx, _mean, collapsed_window);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_4D_tensor_argument(idx, _output, collapsed_window);
}
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index 2f9014a651..9f436da7f6 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-#include "src/core/CL/ICLKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/CL/ICLKernel.h"
+
namespace arm_compute
{
// Forward declarations
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] info Kernel meta-data descriptor
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *mean_var,
+ ICLTensor *output,
+ const InstanceNormalizationLayerKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -69,7 +73,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -106,7 +111,8 @@ public:
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
* @param[in] use_mixed_precision Use mixed precision in case of FP16 execution
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
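
As above, a hedged forwarding sketch for the reformatted CLInstanceNormalizationLayerKernel::validate(); the kernel-info descriptor is assumed to be filled in by the caller, since its construction is outside this patch.

    // Illustrative forwarder only; the descriptor comes from the caller.
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/KernelDescriptors.h"
    #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"

    arm_compute::Status check_instance_norm(const arm_compute::ITensorInfo &input,
                                            const arm_compute::ITensorInfo *output, // may be nullptr for in-place runs
                                            const arm_compute::InstanceNormalizationLayerKernelInfo &info)
    {
        // The checks above require F16/F32 data, a non-zero epsilon and, if output is initialised,
        // matching shape, data type and layout between input and output.
        return arm_compute::CLInstanceNormalizationLayerKernel::validate(&input, output, info);
    }
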
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 542d380e4a..9ed9d7c5b0 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -31,10 +31,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -43,7 +43,8 @@ namespace
{
constexpr int max_input_tensor_dim = 3;
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -53,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -78,16 +80,22 @@ CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+ const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
{
configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
}
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
- auto padding_info = get_padding_info({ input, sum, output });
+ auto padding_info = get_padding_info({input, sum, output});
_input = input;
_sum = sum;
@@ -95,8 +103,9 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_actual_axis = wrap_around(axis, max_input_tensor_dim);
_epsilon = epsilon;
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
// Set build options
CLBuildOptions build_opts;
@@ -107,7 +116,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
// Create kernel
std::string kernel_name;
unsigned int idx = 0;
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
kernel_name = "l2_normalize_x";
@@ -127,7 +136,7 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Set epsilon argument
- if(input->info()->data_type() == DataType::F32)
+ if (input->info()->data_type() == DataType::F32)
{
_kernel.setArg<cl_float>(idx, _epsilon);
}
@@ -146,7 +155,8 @@ void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
return Status{};
@@ -159,7 +169,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
Window window_sum(window);
- switch(_actual_axis)
+ switch (_actual_axis)
{
case 0:
{
@@ -173,8 +183,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 1:
@@ -189,8 +198,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_2D_tensor_argument(idx, _sum, sum_slice);
add_2D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
}
break;
case 2:
@@ -205,8 +213,7 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
add_3D_tensor_argument(idx, _sum, sum_slice);
add_3D_tensor_argument(idx, _output, in_slice);
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+ } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
}
break;
default:
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
index edc0585217..5c9ab94ce5 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -70,7 +71,12 @@ public:
* @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
* @param[in] epsilon Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *sum,
+ ICLTensor *output,
+ int axis,
+ float epsilon);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
*
@@ -84,7 +90,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
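
A minimal sketch of the reformatted CLL2NormalizeLayerKernel::validate() call; the shapes, axis and epsilon are illustrative assumptions. Note that sum holds the reduction of input along the chosen axis, so that dimension collapses to 1.

    // Illustrative sketch only; not part of this patch.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"

    arm_compute::Status check_l2_normalize()
    {
        using namespace arm_compute;
        TensorInfo input(TensorShape(16U, 8U), 1, DataType::F32);
        TensorInfo sum(TensorShape(1U, 8U), 1, DataType::F32); // input reduced along axis 0
        TensorInfo output(input);                              // same shape as input
        return CLL2NormalizeLayerKernel::validate(&input, &sum, &output, 0 /*axis*/, 1e-6f /*epsilon*/);
    }
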
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index dc9d68626d..e560f1de4a 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,26 +43,31 @@ using namespace misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
const Size2D pool_size(pool_size_x, pool_size_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -71,17 +77,20 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _indices(nullptr)
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr)
{
_type = CLKernelType::POOL;
}
-void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
- auto padding_info = get_padding_info({ input, indices, output });
+ auto padding_info = get_padding_info({input, indices, output});
_input = input;
_output = output;
@@ -119,7 +128,10 @@ void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
@@ -140,7 +152,6 @@ void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice);
add_3D_tensor_argument(idx, _indices, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index 45481d0507..eb18a46784 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -59,7 +59,11 @@ public:
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -72,7 +76,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
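
A forwarding sketch for the reformatted CLMaxUnpoolingLayerKernel::validate(); the PoolingLayerInfo descriptor is assumed to come from the caller (typically the one used for the matching max-pooling pass), since it is not constructed anywhere in this patch.

    // Illustrative forwarder only; the descriptor comes from the caller.
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"

    arm_compute::Status check_max_unpooling(const arm_compute::ITensorInfo      &input,
                                            const arm_compute::ITensorInfo      &indices,
                                            const arm_compute::ITensorInfo      &output,
                                            const arm_compute::PoolingLayerInfo &pool_info)
    {
        // The checks above accept MAX pooling with a 2x2 pool size only, U32 indices shaped like the
        // input, and QASYMM8/QASYMM8_SIGNED/F16/F32 data.
        return arm_compute::CLMaxUnpoolingLayerKernel::validate(&input, &indices, &output, pool_info);
    }
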
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index ac33468ad8..8632bdf623 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -69,15 +70,19 @@ void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
_run_in_place = (output == nullptr) || (output == input);
- ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info());
}
@@ -85,7 +90,8 @@ void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_
_input = input;
_output = output;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
// Set build options
CLBuildOptions build_opts;
@@ -134,7 +140,6 @@ void CLMeanStdDevNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index a1ba2b905e..e02a3c58a3 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -66,7 +66,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
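
A minimal sketch of the reformatted CLMeanStdDevNormalizationKernel::validate() call; the 2D shape is an assumption, passing nullptr for the output exercises the in-place path handled above, and the epsilon value matches the documented 1e-8 default.

    // Illustrative sketch only; not part of this patch.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"

    arm_compute::Status check_mean_stddev_norm()
    {
        using namespace arm_compute;
        TensorInfo input(TensorShape(128U, 32U), 1, DataType::F32); // 2D input
        // nullptr output selects the in-place path; epsilon follows the documented default.
        return CLMeanStdDevNormalizationKernel::validate(&input, nullptr, 1e-8f);
    }
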
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index c6c4229c00..b636c485e7 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -63,7 +64,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, N
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -71,9 +73,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
bool window_changed = false;
Window win;
const DataLayout data_layout = input->data_layout();
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+ const unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
const unsigned int norm_idx = get_normalization_dimension_index(input->data_layout(), norm_info);
const bool is_norm_across_width = norm_idx == 0;
@@ -87,15 +90,16 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// The output has 1 right padding because of the vec_size_x.
// The input has 1 left padding because radius = 1.
// The input has 2 right padding because of radius = 1 AND because of the extra output padding
- const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
- const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
- const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
+ const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+ const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
win = calculate_max_window(*input, Steps(vec_size_x));
// We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
// Reads can occur within the valid region of the input
- if(is_norm_across_width)
+ if (is_norm_across_width)
{
AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
window_changed = window_changed || update_window_and_padding(win, input_access);
@@ -112,13 +116,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
else
{
unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
- if(norm_info.is_cross_map())
+ if (norm_info.is_cross_map())
{
vec_size_x = 1;
}
win = calculate_max_window(*input, Steps(vec_size_x));
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -139,10 +144,13 @@ void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
@@ -152,16 +160,17 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_input = input;
_output = output;
- const DataLayout data_layout = input->info()->data_layout();
- unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
- int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
- if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+ const DataLayout data_layout = input->info()->data_layout();
+ unsigned int vec_size_x =
+ adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+ int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+ if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
{
vec_size_x = 1;
vec_size_x_leftovers = 0;
}
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
const unsigned int norm_idx = get_normalization_dimension_index(data_layout, norm_info);
_is_norm_across_width = norm_idx == 0;
@@ -175,9 +184,10 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
// The output has 1 right padding because of the vec_size_x.
// The input has 1 left padding because radius = 1.
// The input has 2 right padding because of radius = 1 AND the extra output padding
- const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
- const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
- _border_size = BorderSize(0, border_width_right, 0, border_width_left);
+ const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+ const unsigned int border_width_right =
+ _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+ _border_size = BorderSize(0, border_width_right, 0, border_width_left);
}
const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
@@ -193,12 +203,14 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
- build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
+ build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+ "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+ "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
// Create kernel
std::string kernel_name;
- if(norm_info.is_in_map())
+ if (norm_info.is_in_map())
{
kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
}
@@ -222,16 +234,19 @@ void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_conte
_config_id += support::cpp11::to_string(input->info()->dimension(0));
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(1));
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
}
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
return Status{};
}
@@ -251,7 +266,6 @@ void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
index 739a2ae9f1..5517ba6904 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -63,7 +63,10 @@ public:
* Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -77,7 +80,7 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run(const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 6b0400d50e..59352a8fb7 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -31,32 +31,35 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -77,7 +80,8 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
bool window_changed = update_window_and_padding(win, input_access, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -88,12 +92,19 @@ CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
@@ -102,7 +113,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
@@ -112,9 +123,10 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
const DataLayout data_layout = input->info()->data_layout();
// Get number of elements to process per iterations
- const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(),
- input->info()->dimension(0)) :
- (16 / input->info()->element_size());
+ const unsigned int num_elems_processed_per_iteration =
+ (data_layout == DataLayout::NHWC)
+ ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+ : (16 / input->info()->element_size());
const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const DataType dt = input->info()->data_type();
@@ -122,11 +134,12 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
+ build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
std::string kernel_name = "normalize_planar_yuv_layer_";
- if(is_data_type_quantized(dt))
+ if (is_data_type_quantized(dt))
{
const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -139,7 +152,7 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
ICLKernel::configure_internal(win);
@@ -165,12 +178,16 @@ void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_
_config_id += support::cpp11::to_string(input->info()->dimension(2));
}
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
- if(input->data_layout() == DataLayout::NCHW)
+ if (input->data_layout() == DataLayout::NCHW)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
}
return Status{};
}
@@ -196,7 +213,6 @@ void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index 6db4433e78..341b404e3d 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -67,7 +67,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -79,7 +83,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
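A minimal usage sketch of the validate() overload wrapped above may help readers skimming the reformatted signatures. It is not part of this patch; the tensor shapes and function name are assumptions chosen for illustration, with mean and std as per-channel vectors as the parameter documentation states.

    // Hypothetical example, not from this change set: checking a
    // NormalizePlanarYUV configuration for a 3-channel F32 tensor.
    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
    using namespace arm_compute;

    Status example_validate_normalize_planar_yuv()
    {
        const TensorInfo src(TensorShape(8U, 4U, 3U), 1, DataType::F32);  // [width, height, channels]
        const TensorInfo dst(TensorShape(8U, 4U, 3U), 1, DataType::F32);
        const TensorInfo mean(TensorShape(3U), 1, DataType::F32);         // one value per channel
        const TensorInfo std_dev(TensorShape(3U), 1, DataType::F32);
        // Returns an empty Status on success, an error Status otherwise.
        return CLNormalizePlanarYUVLayerKernel::validate(&src, &dst, &mean, &std_dev);
    }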
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 53f313c0d3..0ac285038e 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -35,25 +36,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
- if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
- for(size_t i = 0; i < padding.size(); ++i)
+ for (size_t i = 0; i < padding.size(); ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
}
}
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
@@ -65,41 +70,51 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLPadLayerKernel::CLPadLayerKernel()
- : _input(nullptr), _output(nullptr), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+ const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
// Set build options
- const DataType &data_type = input->info()->data_type();
- const unsigned int input_width = input->info()->dimension(0);
- const unsigned int input_height = input->info()->dimension(1);
- const unsigned int input_depth = input->info()->dimension(2);
- const unsigned int pad_x_before = padding.at(0).first;
- const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
- const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
- const unsigned int vec_size = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width);
- const unsigned int pad_right_start = input_width + pad_x_before;
- const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
- const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int vec_size = adjust_vec_size(
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+ input_width);
+ const unsigned int pad_right_start = input_width + pad_x_before;
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int vec_size_leftover_write =
+ vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -108,12 +123,12 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
- if(padding.size() > 1)
+ if (padding.size() > 1)
{
build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- if(padding.size() > 2)
+ if (padding.size() > 2)
{
build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
@@ -121,23 +136,25 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
}
std::string kernel_name = "pad_layer_";
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
kernel_name += "constant";
- const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+ const unsigned int vec_size_leftover_read =
+ vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
- if(pad_x_before >= vec_size)
+ if (pad_x_before >= vec_size)
{
build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
- build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size));
+ build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+ support::cpp11::to_string(pad_right_start / vec_size));
}
- if(_4d_enabled)
+ if (_4d_enabled)
{
build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -154,14 +171,17 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
- const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+ const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
- build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
- build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
- build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
break;
}
@@ -179,7 +199,11 @@ void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
return Status{};
@@ -197,13 +221,12 @@ void CLPadLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
- if(_4d_enabled)
+ if (_4d_enabled)
{
add_argument<unsigned int>(idx, batch++);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
index 90af337f94..dca121b6a1 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.h
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -56,7 +56,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Set the input and output tensor.
*
* @param[in] compile_context The compile context to be used.
@@ -68,8 +72,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -80,7 +88,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
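A short sketch of the reformatted validate() declaration above, again not part of this patch: the shapes, the {before, after} padding pairs, and the function name are arbitrary assumptions for illustration, and the optional arguments fall back to the defaults declared above (PixelValue() and PaddingMode::CONSTANT).

    // Hypothetical example, not from this change set: validating a
    // CONSTANT-mode pad of 1/1 in x and 2/2 in y.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/CL/kernels/CLPadLayerKernel.h"
    using namespace arm_compute;

    Status example_validate_pad_layer()
    {
        const TensorInfo  src(TensorShape(16U, 16U, 3U), 1, DataType::F32);
        const TensorInfo  dst(TensorShape(18U, 20U, 3U), 1, DataType::F32); // 16+1+1, 16+2+2
        const PaddingList padding{{1, 1}, {2, 2}};                          // {before, after} per dimension
        return CLPadLayerKernel::validate(&src, &dst, padding);
    }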
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index bf1b874dd0..7dcdf1de6f 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -51,10 +54,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -62,17 +65,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
}
@@ -80,7 +85,11 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ const PriorBoxLayerInfo &info,
+ int num_priors)
{
ARM_COMPUTE_UNUSED(input2);
// Output tensor auto initialization if not yet initialized
@@ -88,10 +97,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
auto_init_if_empty(*output, output_shape, 1, input1->data_type());
const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, output_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -102,13 +112,25 @@ CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
}
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
- cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -135,7 +157,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
int img_width = info.img_size().x;
int img_height = info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = input2->info()->dimension(width_idx);
img_height = input2->info()->dimension(height_idx);
@@ -143,7 +165,7 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
float step_x = info.steps()[0];
float step_y = info.steps()[0];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -162,18 +184,20 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
build_opts.add_option_if(info.clip(), "-DIN_PLACE");
- if(info.variances().size() > 1)
+ if (info.variances().size() > 1)
{
- for(unsigned int i = 0; i < info.variances().size(); ++i)
+ for (unsigned int i = 0; i < info.variances().size(); ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(i)));
}
}
else
{
- for(unsigned int i = 0; i < 4; ++i)
+ for (unsigned int i = 0; i < 4; ++i)
{
- build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+ build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(info.variances().at(0)));
}
}
@@ -194,13 +218,17 @@ void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, c
ICLKernel::configure_internal(win_config.second);
}
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+ output->clone().get(), info, num_priors)
+ .first);
return Status{};
}
@@ -211,8 +239,9 @@ void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
- queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
- if(!_info.max_sizes().empty())
+ queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+ _info.aspect_ratios().data());
+ if (!_info.max_sizes().empty())
{
queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
}
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
index 6c369a7a4e..a50e0c5ff5 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -57,7 +57,13 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -69,8 +75,14 @@ public:
* @param[in] max Maximum prior box values
* @param[in] aspect_ratios Aspect ratio values
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
- cl::Buffer *aspect_ratios);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info,
+ cl::Buffer *min,
+ cl::Buffer *max,
+ cl::Buffer *aspect_ratios);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -80,14 +92,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
ICLTensor *_output;
PriorBoxLayerInfo _info;
int _num_priors;
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index bd573e54c8..731fcb8e04 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -22,10 +22,12 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -49,14 +51,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
/* If width is less then step, then make step same as width to avoid global size being step instead of actual width. */
/* Or we should fix in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
- const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+ const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+ ? input->dimension(0)
+ : temp_num_elems_processed_per_iteration;
// This kernel doesn't need padding
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
return std::make_pair(Status{}, win);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
@@ -72,7 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -87,10 +94,14 @@ CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
- auto padding_info = get_padding_info({ input, weight, bias, output });
+ auto padding_info = get_padding_info({input, weight, bias, output});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
@@ -104,7 +115,8 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
int32_t output_multiplier{};
int32_t output_shift{};
const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
- const Status status = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+ const Status status =
+ quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
output_shift *= -1;
// Set build options
@@ -114,8 +126,12 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
- build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
- build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMIN_BOUND=" +
+ support::cpp11::to_string(std::get<0>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+ build_opts.add_option("-DMAX_BOUND=" +
+ support::cpp11::to_string(std::get<1>(
+ quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
// Create kernel
_kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -135,12 +151,18 @@ void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
}
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -171,7 +193,6 @@ void CLQLSTMLayerNormalizationKernel::run(const Window &window, cl::CommandQueue
add_2D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 31085c37ba..ba912e1d2d 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -63,7 +63,11 @@ public:
* @param[in] weight Weight tensor. Data types supported: Same as @p input.
* @param[in] bias Bias tensor. Data types supported: S32.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *weight,
+ const ICLTensor *bias);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16.
@@ -73,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
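A sketch of the validate() overload wrapped above, not part of this patch: the data types follow the parameter documentation (QSYMM16 input and weight, S32 bias), while the shapes, the quantization scale, and the function name are assumptions chosen for illustration.

    // Hypothetical example, not from this change set: checking a QLSTM
    // layer-normalization configuration for 2 rows of 32 elements.
    #include "arm_compute/core/QuantizationInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
    using namespace arm_compute;

    Status example_validate_qlstm_layer_norm()
    {
        const TensorInfo src(TensorShape(32U, 2U), 1, DataType::QSYMM16);
        const TensorInfo dst(TensorShape(32U, 2U), 1, DataType::QSYMM16);
        TensorInfo weight(TensorShape(32U), 1, DataType::QSYMM16);
        weight.set_quantization_info(QuantizationInfo(1.f / 1024)); // per-tensor scale, assumed
        const TensorInfo bias(TensorShape(32U), 1, DataType::S32);  // same length as weight
        return CLQLSTMLayerNormalizationKernel::validate(&src, &dst, &weight, &bias);
    }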
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 69a6fa5fa0..c97910ef79 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,24 +43,29 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -82,12 +88,19 @@ CLROIAlignLayerKernel::CLROIAlignLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
@@ -97,7 +110,7 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
output->info()->set_data_layout(input->info()->data_layout());
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
_input = input;
_output = output;
@@ -111,16 +124,23 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
- build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
- build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+ build_opts.add_option("-DMAX_DIM_X=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+ build_opts.add_option("-DMAX_DIM_Y=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+ build_opts.add_option("-DMAX_DIM_Z=" +
+ support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+ input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
- build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+ build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+ "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
- if(is_qasymm)
+ if (is_qasymm)
{
const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -144,7 +164,10 @@ void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
index 5284a5913f..2e84e5d303 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -61,7 +61,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -77,7 +78,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -93,7 +98,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index f6933c6cfd..1b2c414a49 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,7 +49,10 @@ CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
@@ -61,10 +65,11 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -72,20 +77,30 @@ Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensor
return Status{};
}
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
- ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
- auto padding_info = get_padding_info({ input, rois, output });
+ auto padding_info = get_padding_info({input, rois, output});
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
- auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
+ auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
// Set instance variables
_input = input;
@@ -107,11 +122,12 @@ void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context,
build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
- if(is_qasymm)
+ if (is_qasymm)
{
// Determine quantization info scale, offset
UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
- uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
+ uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(),
+ _output->info()->quantization_info().uniform());
build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
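
Illustrative sketch (editorial, not part of the patch): the configure() above auto-initializes the output as a 4-D tensor of shape (pooled width, pooled height, input channels, number of ROIs), exactly the TensorShape built in the diff. A minimal standalone C++ sketch of that shape derivation, using hypothetical dimension values in place of pool_info and the tensor-info queries:

#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical values standing in for pool_info and tensor-info queries.
    const std::size_t pooled_w = 7;   // pool_info.pooled_width()
    const std::size_t pooled_h = 7;   // pool_info.pooled_height()
    const std::size_t channels = 256; // input->info()->dimension(2)
    const std::size_t num_rois = 64;  // rois->info()->dimension(1)

    // Shape passed to auto_init_if_empty() in the kernel: {W, H, C, number of ROIs}.
    const std::array<std::size_t, 4> output_shape{pooled_w, pooled_h, channels, num_rois};

    for (std::size_t d : output_shape)
        std::cout << d << ' '; // 7 7 256 64
    std::cout << '\n';
    return 0;
}
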
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 7b7b457632..80bfb63092 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -59,7 +59,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -74,7 +75,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -92,7 +97,10 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ICLTensor *_input;
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index a06c2eed75..622f6210b9 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,11 +43,8 @@ constexpr unsigned int vector_size_byte_opencl = 16;
Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8,
- DataType::U16, DataType::S16,
- DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
@@ -56,19 +54,22 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-CLRangeKernel::CLRangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -78,16 +79,18 @@ void CLRangeKernel::configure(ICLTensor *output, const float start, const float
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
// Configure kernel window
- unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- auto padding_info = get_padding_info({ output });
+ auto padding_info = get_padding_info({output});
_start = start;
_end = end;
@@ -100,10 +103,11 @@ void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
- if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(output->info()->data_type()))
{
const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
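
Illustrative sketch (editorial, not part of the patch): validate_arguments() above rejects an output whose total size is smaller than num_of_elements_in_range(start, end, step). The helper's body is not shown in the diff; the sketch below assumes the usual definition of a strided half-open range, so the formula is an assumption rather than the library's exact code:

#include <cmath>
#include <cstddef>
#include <iostream>

// Assumed behaviour of num_of_elements_in_range(): the number of values visited in
// the half-open range [start, end) with stride `step`.
std::size_t expected_range_length(float start, float end, float step)
{
    return static_cast<std::size_t>(std::ceil(std::fabs(end - start) / std::fabs(step)));
}

int main()
{
    std::cout << expected_range_length(0.0f, 10.0f, 2.5f) << '\n'; // 4 -> values 0, 2.5, 5, 7.5
    return 0;
}
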
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
index 1b94a099ed..65251a11e5 100644
--- a/src/core/CL/kernels/CLRangeKernel.h
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLRANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index e5cfb997ca..70875a2d40 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -28,15 +28,15 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -47,23 +47,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+ "Not supported reduction operation for QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8)
- && (input->data_type() != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+ ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+ (input->data_type() != DataType::QASYMM8) &&
+ (input->data_type() != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+ "Not supported reduction operation, use CLArgMinMaxLayer");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -79,33 +84,42 @@ CLReductionOperationKernel::CLReductionOperationKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
}
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
_reduction_axis = axis;
_op = op;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
// Set build options
CLBuildOptions build_opts;
DataType data_type = input->info()->data_type();
std::string data_type_promoted{};
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
data_type_promoted = "int";
}
@@ -130,10 +144,14 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
- build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
-
- switch(op)
+ build_opts.add_option_if(is_data_type_quantized(data_type),
+ "-DOFFSET=" +
+ support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+ build_opts.add_option_if(
+ is_data_type_quantized(data_type),
+ "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+
+ switch (op)
{
case ReductionOperation::SUM_SQUARE:
build_opts.add_option(("-DOPERATION=square_sum"));
@@ -159,7 +177,7 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
std::string kernel_axis_name;
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(axis)
+ switch (axis)
{
case 0:
{
@@ -187,13 +205,17 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(vec_size));
- win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
return Status{};
@@ -205,18 +227,19 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
- switch(_reduction_axis)
+ switch (_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(is_serial_op)
+ if (is_serial_op)
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ Window window_in{window};
+ window_in.set(Window::DimX,
+ Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- Window out_window{ window };
+ Window out_window{window};
out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
Window in_slice = window_in.first_slice_window_1D();
@@ -228,8 +251,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_1D_tensor_argument(idx, _input, in_slice);
add_1D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+ } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
}
else
{
@@ -251,8 +273,9 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
case 1:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ Window window_in{window};
+ window_in.set(Window::DimY,
+ Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
Window in_slice = window_in.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
@@ -262,15 +285,15 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_2D_tensor_argument(idx, _input, in_slice);
add_2D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
break;
case 2:
{
// Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ Window window_in{window};
+ window_in.set(Window::DimZ,
+ Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
Window in_slice = window_in.first_slice_window_3D();
Window out_slice = window.first_slice_window_3D();
@@ -280,14 +303,13 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
}
break;
case 3:
{
// Get first input and output slices
- Window window_in{ window };
+ Window window_in{window};
window_in.set(3, Window::Dimension(0, 1, 1));
Window in_slice = window_in.first_slice_window_4D();
Window out_slice = window.first_slice_window_4D();
@@ -298,8 +320,7 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
add_4D_tensor_argument(idx, _input, in_slice);
add_4D_tensor_argument(idx, _output, out_slice);
enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
}
break;
default:
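
Illustrative sketch (editorial, not part of the patch): the run() method above walks 1-D/2-D/3-D/4-D slices and, for the serialized axis-0 path, accumulates each slice in turn inside a do/while loop. A conceptual standalone sketch of a SUM reduction along axis 0 of a small 2-D array; this only mirrors the arithmetic, not the OpenCL kernel or windowing machinery:

#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // Input of shape {X = 4, Y = 3}; reduce along axis 0 (X) with SUM, keeping the reduced axis.
    const std::array<std::array<float, 4>, 3> in{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}};
    std::array<float, 3> out{}; // conceptually the {1, 3} output

    for (std::size_t y = 0; y < in.size(); ++y)        // one 1-D slice per y, as in the do/while loop
        for (std::size_t x = 0; x < in[y].size(); ++x) // serial accumulation over X
            out[y] += in[y][x];

    for (float v : out)
        std::cout << v << ' '; // 10 26 42
    std::cout << '\n';
    return 0;
}
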
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
index b456378746..2f94b2add3 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.h
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -67,7 +68,11 @@ public:
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
*
@@ -79,7 +84,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 3c74e80d33..9fd21943e8 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -28,9 +28,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -51,13 +52,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -66,8 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLReorgLayerKernel::CLReorgLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -77,17 +80,22 @@ void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, in
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
_input = input;
_output = output;
- std::string kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
- const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ std::string kernel_name =
+ std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+ const size_t idx_channel =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
// Create kernel
CLBuildOptions build_opts;
@@ -98,7 +106,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
// Configure window
// auto initialize the output tensor if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
Window win = calculate_max_window(*output->info(), Steps());
@@ -119,7 +129,9 @@ void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, cons
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+ const arm_compute::ITensorInfo *output,
+ int32_t stride)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
@@ -139,7 +151,6 @@ void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
add_3D_tensor_argument(idx, _input, slice);
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
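
Illustrative sketch (editorial, not part of the patch): validate_arguments() above requires the input width and height to be multiples of the stride, and configure() auto-initializes the output via compute_reorg_output_shape(). That helper's formula is not in the diff, so the sketch assumes the usual reorg definition (stride x stride spatial blocks folded into channels):

#include <array>
#include <cassert>
#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical {W, H, C, N} dimensions; the formula of compute_reorg_output_shape()
    // is an assumption based on the usual reorg definition.
    const std::size_t stride = 2;
    const std::array<std::size_t, 4> in{26, 26, 64, 1};
    assert(in[0] % stride == 0 && in[1] % stride == 0); // the checks enforced by validate_arguments()

    const std::array<std::size_t, 4> out{in[0] / stride, in[1] / stride, in[2] * stride * stride, in[3]};
    for (std::size_t d : out)
        std::cout << d << ' '; // 13 13 256 1
    std::cout << '\n';
    return 0;
}
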
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
index 455a6170c6..f335071e9f 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.h
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index 0d70ff4f3c..79a0f03b1e 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,7 +50,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -60,8 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLReverseKernel::CLReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -71,10 +71,13 @@ void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
}
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
- auto padding_info = get_padding_info({ input, output, axis });
+ auto padding_info = get_padding_info({input, output, axis});
_input = input;
_output = output;
@@ -138,7 +141,6 @@ void CLReverseKernel::run(const Window &window, cl::CommandQueue &queue)
add_1D_tensor_argument(idx, _axis, axis_slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
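
Illustrative sketch (editorial, not part of the patch): the reverse kernel takes a U32 axis tensor naming at most four dimensions to flip. A standalone sketch of that coordinate flip on a tiny row-major 2-D array; the layout and values are hypothetical and only the indexing idea is taken from the kernel:

#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // 2-D input of shape {X = 3, Y = 2}, stored row-major; reverse axis 0 only.
    const std::array<std::size_t, 2> shape{3, 2};
    const std::vector<std::uint32_t> axes{0}; // contents of the U32 axis tensor (at most 4 entries)
    const std::array<int, 6> in{1, 2, 3, 4, 5, 6};

    std::array<int, 6> out{};
    for (std::size_t y = 0; y < shape[1]; ++y)
        for (std::size_t x = 0; x < shape[0]; ++x)
        {
            std::size_t sx = x, sy = y;
            for (std::uint32_t a : axes) // flip every coordinate named in the axis tensor
            {
                if (a == 0) sx = shape[0] - 1 - x;
                if (a == 1) sy = shape[1] - 1 - y;
            }
            out[y * shape[0] + x] = in[sy * shape[0] + sx];
        }

    for (int v : out)
        std::cout << v << ' '; // 3 2 1 6 5 4
    std::cout << '\n';
    return 0;
}
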
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
index 4a21e4f802..fbd99dc883 100644
--- a/src/core/CL/kernels/CLReverseKernel.h
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -60,7 +60,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index c0e014e8b8..703c64d8d3 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -51,9 +51,11 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -63,13 +65,16 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
}
} // namespace
-CLSelectKernel::CLSelectKernel()
- : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -80,7 +85,7 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
_output = output;
_has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
- auto padding_info = get_padding_info({ c, x, y, output });
+ auto padding_info = get_padding_info({c, x, y, output});
const unsigned int vec_size_x = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
const int vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
@@ -92,14 +97,14 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
// Create kernel
std::string kernel_name = "select";
- if(_has_same_rank)
+ if (_has_same_rank)
{
kernel_name += "_same_rank";
}
else
{
const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
- if(is_input_rank_greater_than_two)
+ if (is_input_rank_greater_than_two)
{
const size_t width = x->info()->tensor_shape().x();
const size_t height = x->info()->tensor_shape().y();
@@ -128,7 +133,8 @@ void CLSelectKernel::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
return Status{};
@@ -142,7 +148,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
- if(!_has_same_rank)
+ if (!_has_same_rank)
{
Window vector_slice = window.first_slice_window_1D();
vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -153,7 +159,7 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
do
{
unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
- if(_has_same_rank)
+ if (_has_same_rank)
{
add_3D_tensor_argument(idx, _c, slice);
}
@@ -162,7 +168,6 @@ void CLSelectKernel::run(const arm_compute::Window &window, cl::CommandQueue &qu
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace arm_compute
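
Illustrative sketch (editorial, not part of the patch): in the same-rank case handled by the "select_same_rank" kernel variant above, the condition, x and y tensors share one shape and each output element is chosen by the U8 condition. A minimal standalone sketch of that element-wise select (the broadcast, different-rank case from the diff is not covered here):

#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    // Same-rank case: condition, x and y share one shape; out[i] = c[i] ? x[i] : y[i].
    const std::array<std::uint8_t, 4> c{1, 0, 0, 1}; // U8 condition tensor
    const std::array<float, 4>        x{10.f, 20.f, 30.f, 40.f};
    const std::array<float, 4>        y{-1.f, -2.f, -3.f, -4.f};

    std::array<float, 4> out{};
    for (std::size_t i = 0; i < out.size(); ++i)
        out[i] = (c[i] != 0) ? x[i] : y[i];

    for (float v : out)
        std::cout << v << ' '; // 10 -2 -3 40
    std::cout << '\n';
    return 0;
}
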
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
index b8c10cd7cf..c4256fd743 100644
--- a/src/core/CL/kernels/CLSelectKernel.h
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,11 @@ public:
* @param[in] y Second input tensor. Data types supported: Same as @p x
* @param[out] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
*
* @param[in] c Condition input tensor. Data types supported: U8.
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 3632ae2b03..f4c0839ad2 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,19 +39,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -61,7 +65,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -70,9 +78,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -88,16 +97,24 @@ CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
- auto padding_info = get_padding_info({ input, block_shape, paddings, output });
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ auto padding_info = get_padding_info({input, block_shape, paddings, output});
_input = input;
_block_shape = block_shape;
@@ -111,14 +128,17 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
- _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -126,22 +146,34 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right,
- ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -153,7 +185,8 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -166,22 +199,32 @@ void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
- _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(
+ compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
ICLKernel::configure_internal(win);
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -218,7 +261,6 @@ void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queu
add_3D_tensor_argument(idx, _output, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
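
Illustrative sketch (editorial, not part of the patch): the static configure() overload above derives the output via compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right). The helper's body is not shown in the diff, so the sketch assumes the standard space-to-batch shape rule (padded spatial dims divided by the block, batch multiplied by the block area):

#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical {W, H, C, N} dimensions; the formula is an assumption based on the
    // standard space-to-batch definition, not the library helper itself.
    const std::array<std::size_t, 4> in{10, 10, 3, 1};
    const std::size_t bx = 2, by = 2;                                 // block_shape_x / block_shape_y
    const std::size_t pad_lx = 1, pad_rx = 1, pad_ly = 0, pad_ry = 0; // padding_left / padding_right

    const std::array<std::size_t, 4> out{(in[0] + pad_lx + pad_rx) / bx,
                                         (in[1] + pad_ly + pad_ry) / by,
                                         in[2],
                                         in[3] * bx * by};
    for (std::size_t d : out)
        std::cout << d << ' '; // 6 5 3 4
    std::cout << '\n';
    return 0;
}
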
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 4817cfeef2..f9dce9db47 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -63,7 +64,11 @@ public:
* @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -73,7 +78,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Initialise the kernel's input and output. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
@@ -84,8 +94,13 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -95,7 +110,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -107,7 +125,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index c5ffdb588b..25662b5c62 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -64,8 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -75,10 +75,13 @@ void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
+ auto padding_info = get_padding_info({input, output});
TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
@@ -94,11 +97,14 @@ void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_contex
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
- _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+ _kernel = create_kernel(compile_context,
+ "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+ build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -136,7 +142,6 @@ void CLSpaceToDepthLayerKernel::run(const Window &window, cl::CommandQueue &queu
enqueue(queue, *this, slice_out, lws_hint());
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
} // namespace arm_compute
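
Illustrative sketch (editorial, not part of the patch): configure() above auto-initializes the output from compute_space_to_depth_shape(input, block_shape). The formula below is an assumption based on the standard space-to-depth definition, since the helper's body is not part of this diff:

#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical {W, H, C, N} dimensions; spatial dims shrink by block_shape while
    // channels grow by block_shape squared (assumed standard space-to-depth rule).
    const std::size_t block_shape = 2;
    const std::array<std::size_t, 4> in{8, 8, 3, 1};

    const std::array<std::size_t, 4> out{in[0] / block_shape, in[1] / block_shape,
                                         in[2] * block_shape * block_shape, in[3]};
    for (std::size_t d : out)
        std::cout << d << ' '; // 4 4 12 1
    std::cout << '\n';
    return 0;
}
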
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index bb1ac5f9a6..d0932919e0 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -61,7 +62,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 075c93ab60..23e26716e7 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/StringSupport.h"
using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,11 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -51,9 +55,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -61,7 +66,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -73,18 +79,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
}
} // namespace
-CLStackLayerKernel::CLStackLayerKernel()
- : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
}
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -112,10 +123,15 @@ void CLStackLayerKernel::configure(const CLCompileContext &compile_context, cons
_kernel.setArg<cl_uint>(idx, idx_input);
}
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
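Aside (illustrative, not part of the patch): CLStackLayerKernel::validate() above shows the two-stage validation used throughout these kernels: a side-effect-free validate_arguments() pass, followed by validate_and_configure_window() run on cloned tensor infos so the dry run never touches the caller's objects. A simplified, self-contained sketch of that flow; Status, FakeTensorInfo and the helpers below are stand-ins, not Compute Library types:

#include <string>
#include <utility>

struct Status
{
    bool        ok = true;
    std::string message;
};

struct FakeTensorInfo
{
    unsigned int num_dimensions = 4;
    unsigned int total_size     = 0;
};

// Stage 1: pure argument checks.
Status validate_arguments(const FakeTensorInfo &input, unsigned int axis)
{
    if (axis > input.num_dimensions)
        return {false, "axis out of range"};
    return {};
}

// Stage 2: shape/window configuration performed on a copy (the real code works on clones).
std::pair<Status, int /*window placeholder*/> validate_and_configure_window(FakeTensorInfo input)
{
    input.total_size = 16; // pretend auto-initialisation of an empty output
    return {Status{}, 0};
}

Status validate(const FakeTensorInfo &input, unsigned int axis)
{
    const Status args = validate_arguments(input, axis);
    if (!args.ok)
        return args;
    return validate_and_configure_window(input).first;
}

int main()
{
    return validate(FakeTensorInfo{}, 2).ok ? 0 : 1;
}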
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
index 2865127a90..d3c17f529c 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.h
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -60,7 +61,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(
+ const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
/** Initialise the kernel's inputs and output
*
* @note Supported input tensor rank: up to 4
@@ -74,7 +76,12 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -88,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 9acbafdb19..a8f6112820 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -22,11 +22,13 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/bit_ops.h"
@@ -37,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -48,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -76,28 +80,33 @@ CLStridedSliceKernel::CLStridedSliceKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- auto padding_info = get_padding_info({ input, output });
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ auto padding_info = get_padding_info({input, output});
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
const TensorShape &input_shape = input->tensor_shape();
Coordinates starts_abs;
Coordinates ends_abs;
Coordinates final_strides;
- std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(starts_abs, ends_abs, final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
Window win = calculate_max_window(*output, Steps());
@@ -108,29 +117,33 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
// Update window if needed
- if(multi_access_x)
+ if (multi_access_x)
{
Window &updated_window = win;
updated_window.set(Window::DimX,
- Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+ Window::Dimension(updated_window.x().start(),
+ ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
- build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+ build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(starts_abs[i]));
+ build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+ support::cpp11::to_string(final_strides[i]));
build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
}
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
- "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
- "-DSRC_DEPTH=1");
+ "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1");
build_opts.add_option_if_else(output->num_dimensions() > 2,
"-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
"-DDST_DEPTH=1");
@@ -142,7 +155,7 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
_config_id = "strided_slice";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->data_type()));
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->dimension(i));
@@ -156,11 +169,17 @@ void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, co
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
return Status{};
}
@@ -170,8 +189,9 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = window_collapsed.first_slice_window_4D();
@@ -182,7 +202,6 @@ void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_4D(slice));
+ } while (window_collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
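Aside (illustrative, not part of the patch): the configure() hunk above pads the X dimension of the execution window up to a multiple of the vector size and passes LAST_ACCESSED_X so the kernel can clamp its final vector load. The arithmetic, with plain integers standing in for Window and CLBuildOptions and hypothetical sizes:

#include <algorithm>
#include <cassert>

// Mirrors ceil_to_multiple() used when the window is updated above.
constexpr int ceil_to_multiple(int value, int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    const int output_width_x = 23; // hypothetical output width
    const int vec_size_x     = 4;  // elements processed per work-item

    // Window end is rounded up so every work-item handles a full vector...
    const int padded_end = ceil_to_multiple(output_width_x, vec_size_x); // 24

    // ...and -DLAST_ACCESSED_X tells the kernel where the last in-bounds vector starts.
    const int last_accessed_x = std::max(output_width_x - vec_size_x, 0); // 19

    assert(padded_end == 24 && last_accessed_x == 19);
    return 0;
}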
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 4c201504f5..1cf5bcacec 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/CL/ICLKernel.h"
#include <cstdint>
@@ -53,9 +54,15 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -71,9 +78,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
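Aside (illustrative, not part of the patch): shrink_axis_mask, documented above, is a plain bit field: bit i set means dimension i is reduced to a single element and dropped from the output rank (begin_mask and end_mask are the analogous per-dimension overrides for starts and ends). A minimal standard-library sketch of the bit test and the resulting shape; the shape values are hypothetical:

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors helpers::bit_ops::is_bit_set() used in the .cpp hunk above.
inline bool is_bit_set(std::int32_t mask, unsigned int bit)
{
    return ((mask >> bit) & 1) != 0;
}

int main()
{
    const std::vector<int> sliced_shape     = {6, 1, 4, 1}; // extents after applying starts/ends/strides
    const std::int32_t     shrink_axis_mask = 0b1010;       // collapse dimensions 1 and 3

    std::vector<int> output_shape;
    for (unsigned int i = 0; i < sliced_shape.size(); ++i)
    {
        if (!is_bit_set(shrink_axis_mask, i))
            output_shape.push_back(sliced_shape[i]); // kept dimensions survive
        // a set bit keeps a slice of size 1 and removes the dimension from the output rank
    }

    for (int d : output_shape)
        std::cout << d << ' '; // prints: 6 4
    return 0;
}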
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 3e7015cfd2..fa996c4008 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/core/CL/kernels/CLTileKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
@@ -39,15 +41,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -55,8 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-CLTileKernel::CLTileKernel()
- : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -66,7 +65,10 @@ void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Mu
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -104,15 +106,14 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
// Configure window without padding
Window win = calculate_max_window(*output->info());
- if(multi_access_x)
+ if (multi_access_x)
{
// If multi-access is enabled, no thread should cross the tile boundaries. This means we need
// as many threads as those to cover a single tile times multiples[0]. Note that if threads
// do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
// we don't need to pad the output
const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+ win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -121,7 +122,7 @@ void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLT
_config_id = "tile";
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
- for(unsigned int i = 0; i < multiples.size(); ++i)
+ for (unsigned int i = 0; i < multiples.size(); ++i)
{
_config_id += "_";
_config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -150,7 +151,6 @@ void CLTileKernel::run(const Window &window, cl::CommandQueue &queue)
add_4D_tensor_argument(idx, _input, slice);
add_4D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_4D(slice));
+ } while (collapsed.slide_window_slice_4D(slice));
}
} // namespace arm_compute
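Aside (illustrative, not part of the patch): the validation above compares the configured output against compute_tiled_shape(), i.e. every output dimension is the input dimension multiplied by the matching entry of multiples. A self-contained model of that shape rule (simplified; the real helper works on TensorShape):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for misc::shape_calculator::compute_tiled_shape().
std::vector<std::uint32_t> compute_tiled_shape(const std::vector<std::uint32_t> &input,
                                               const std::vector<std::uint32_t> &multiples)
{
    std::vector<std::uint32_t> out(input);
    for (std::size_t i = 0; i < out.size() && i < multiples.size(); ++i)
        out[i] *= multiples[i]; // dimensions without a multiple keep their extent
    return out;
}

int main()
{
    const auto tiled = compute_tiled_shape({8, 8, 3}, {2, 3, 1}); // hypothetical W x H x C tensor
    assert((tiled == std::vector<std::uint32_t>{16, 24, 3}));
    return 0;
}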
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
index 41752ca90b..c3486aecef 100644
--- a/src/core/CL/kernels/CLTileKernel.h
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -64,7 +64,10 @@ public:
* @param[out] output Destination tensor. Same as @p input
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
*
* @param[in] input Source tensor info. Data type supported: All.
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index 6a3f66fd5a..9980db42f3 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/Error.h"
+
#include "src/common/cpuinfo/CpuInfo.h"
#include "src/common/cpuinfo/CpuIsaInfo.h"
@@ -43,8 +44,7 @@ CPUInfo &CPUInfo::get()
return _cpuinfo;
}
-CPUInfo::CPUInfo()
- : _impl(std::make_unique<Impl>())
+CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>())
{
_impl->info = cpuinfo::CpuInfo::build();
}
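Aside (illustrative, not part of the patch): the CPUInfo hunks above combine a function-local static singleton (get()) with the pImpl idiom (_impl held through std::unique_ptr so CPU-detection details stay out of the header). A minimal standalone version of the same pattern; the class and member names are placeholders, not the library's:

#include <memory>

class CpuInfoLike
{
public:
    static CpuInfoLike &get()
    {
        static CpuInfoLike instance; // constructed once on first use, thread-safe since C++11
        return instance;
    }

    bool has_fp16() const { return _impl->fp16; }

private:
    struct Impl
    {
        bool fp16 = false; // would be filled in by runtime feature detection
    };

    CpuInfoLike() : _impl(std::make_unique<Impl>()) {}

    std::unique_ptr<Impl> _impl; // pImpl: callers never see Impl's layout
};

int main()
{
    return CpuInfoLike::get().has_fp16() ? 0 : 1;
}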
diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h
index df192b5131..fe253508cf 100644
--- a/src/core/CPP/Validate.h
+++ b/src/core/CPP/Validate.h
@@ -38,8 +38,8 @@ namespace arm_compute
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
{
bool fp16_kernels_enabled = false;
#if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
@@ -47,8 +47,9 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
#endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled),
- function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function,
+ file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
return Status{};
}
@@ -61,8 +62,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
{
bool bf16_kernels_enabled = false;
#if defined(ARM_COMPUTE_ENABLE_BF16)
@@ -70,8 +71,9 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
- function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
+ function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
return Status{};
}
@@ -84,8 +86,8 @@ inline Status error_on_unsupported_cpu_bf16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
@@ -101,8 +103,8 @@ inline Status error_on_unsupported_cpu_fp16(const char *function, const char *fi
*
* @return Status
*/
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
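Aside (illustrative, not part of the patch): error_on_unsupported_cpu_fp16()/_bf16() above only succeed when both the build enables the corresponding kernels and the running CPU reports the feature. A simplified standalone version of that two-level gate; the macro name and the detection stub are placeholders:

#include <iostream>
#include <string>

struct Status
{
    bool        ok = true;
    std::string error;
};

bool cpu_has_fp16() { return false; } // placeholder for runtime HWCAP/CPUID detection

Status check_fp16_support(bool tensor_is_f16)
{
    bool fp16_kernels_enabled = false;
#if defined(EXAMPLE_ENABLE_FP16) // stand-in for the ARM_COMPUTE_ENABLE_FP16 / ENABLE_FP16_KERNELS guard
    fp16_kernels_enabled = true;
#endif
    if (tensor_is_f16 && (!cpu_has_fp16() || !fp16_kernels_enabled))
        return {false, "F16 requires an FP16-enabled build and an Armv8.2+ CPU"};
    return {};
}

int main()
{
    const Status s = check_fp16_support(true);
    std::cout << (s.ok ? "supported" : s.error) << '\n';
    return 0;
}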
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 0f405d8e83..02686eb4f6 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include <algorithm>
@@ -34,7 +35,11 @@ namespace arm_compute
namespace
{
template <typename T>
-std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> SoftNMS(const ITensor *proposals,
+ std::vector<std::vector<T>> &scores_in,
+ std::vector<int> inds,
+ const BoxNMSLimitInfo &info,
+ int class_id)
{
std::vector<int> keep;
const int proposals_width = proposals->info()->dimension(1);
@@ -45,7 +50,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
std::vector<T> y2(proposals_width);
std::vector<T> areas(proposals_width);
- for(int i = 0; i < proposals_width; ++i)
+ for (int i = 0; i < proposals_width; ++i)
{
x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -56,13 +61,13 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Note: Soft NMS scores have already been initialized with input scores
- while(!inds.empty())
+ while (!inds.empty())
{
// Find proposal with max score among remaining proposals
int max_pos = 0;
- for(unsigned int i = 1; i < inds.size(); ++i)
+ for (unsigned int i = 1; i < inds.size(); ++i)
{
- if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+ if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
{
max_pos = i;
}
@@ -75,7 +80,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
inds.erase(inds.begin());
std::vector<int> sorted_indices_temp;
- for(auto idx : inds)
+ for (auto idx : inds)
{
const auto xx1 = std::max(x1[idx], x1[element]);
const auto yy1 = std::max(y1[idx], y1[element]);
@@ -89,7 +94,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Update scores based on computed IoU, overlap threshold and NMS method
T weight;
- switch(info.soft_nms_method())
+ switch (info.soft_nms_method())
{
case NMSType::LINEAR:
weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
@@ -106,7 +111,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
// Discard boxes with new scores below min threshold and update pending indices
scores_in[class_id][idx] *= weight;
- if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+ if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
{
sorted_indices_temp.push_back(idx);
}
@@ -118,7 +123,10 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
}
template <typename T>
-std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> NonMaximaSuppression(const ITensor *proposals,
+ std::vector<int> sorted_indices,
+ const BoxNMSLimitInfo &info,
+ int class_id)
{
std::vector<int> keep;
@@ -130,7 +138,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
std::vector<T> y2(proposals_width);
std::vector<T> areas(proposals_width);
- for(int i = 0; i < proposals_width; ++i)
+ for (int i = 0; i < proposals_width; ++i)
{
x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -139,7 +147,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
}
- while(!sorted_indices.empty())
+ while (!sorted_indices.empty())
{
int i = sorted_indices.at(0);
keep.push_back(i);
@@ -148,7 +156,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
std::vector<int> new_indices;
sorted_indices_temp.erase(sorted_indices_temp.begin());
- for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+ for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
{
const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
@@ -163,8 +171,9 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
const float ctr_y = yy1 + (h / 2);
// If suppress_size is specified, filter the boxes based on their size and position
- const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
- if(ovr <= info.nms() && keep_size)
+ const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() &&
+ ctr_x < info.im_width() && ctr_y < info.im_height());
+ if (ovr <= info.nms() && keep_size)
{
new_indices.push_back(j);
}
@@ -172,7 +181,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
const unsigned int new_indices_size = new_indices.size();
std::vector<int> new_sorted_indices(new_indices_size);
- for(unsigned int i = 0; i < new_indices_size; ++i)
+ for (unsigned int i = 0; i < new_indices_size; ++i)
{
new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
}
@@ -184,7 +193,15 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
} // namespace
CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
- : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+ : _scores_in(nullptr),
+ _boxes_in(nullptr),
+ _batch_splits_in(nullptr),
+ _scores_out(nullptr),
+ _boxes_out(nullptr),
+ _classes(nullptr),
+ _batch_splits_out(nullptr),
+ _keeps(nullptr),
+ _keeps_size(nullptr),
_info()
{
}
@@ -197,7 +214,7 @@ bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
template <typename T>
void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
{
- const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+ const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
const int num_classes = _scores_in->info()->dimension(0);
const int scores_count = _scores_in->info()->dimension(1);
std::vector<int> total_keep_per_batch(batch_size);
@@ -205,51 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
int total_keep_count = 0;
std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
- for(int i = 0; i < scores_count; ++i)
+ for (int i = 0; i < scores_count; ++i)
{
- for(int j = 0; j < num_classes; ++j)
+ for (int j = 0; j < num_classes; ++j)
{
in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
}
}
int cur_start_idx = 0;
- for(int b = 0; b < batch_size; ++b)
+ for (int b = 0; b < batch_size; ++b)
{
// Skip first class if there is more than 1 except if the number of classes is 1.
const int j_start = (num_classes == 1 ? 0 : 1);
- for(int j = j_start; j < num_classes; ++j)
+ for (int j = j_start; j < num_classes; ++j)
{
std::vector<T> cur_scores(scores_count);
std::vector<int> inds;
- for(int i = 0; i < scores_count; ++i)
+ for (int i = 0; i < scores_count; ++i)
{
const T score = in_scores[j][i];
cur_scores[i] = score;
- if(score > _info.score_thresh())
+ if (score > _info.score_thresh())
{
inds.push_back(i);
}
}
- if(_info.soft_nms_enabled())
+ if (_info.soft_nms_enabled())
{
keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
}
else
{
std::sort(inds.data(), inds.data() + inds.size(),
- [&cur_scores](int lhs, int rhs)
- {
- return cur_scores[lhs] > cur_scores[rhs];
- });
+ [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; });
keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
}
total_keep_count += keeps[j].size();
}
- if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+ if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
{
// merge all scores (represented by indices) together and sort
auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
@@ -257,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
std::vector<T> ret(total_keep_count);
int ret_idx = 0;
- for(unsigned int i = 1; i < keeps.size(); ++i)
+ for (unsigned int i = 1; i < keeps.size(); ++i)
{
auto &cur_keep = keeps[i];
- for(auto &ckv : cur_keep)
+ for (auto &ckv : cur_keep)
{
ret[ret_idx++] = in_scores[i][ckv];
}
@@ -273,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
auto all_scores_sorted = get_all_scores_sorted();
const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
- for(int j = 1; j < num_classes; ++j)
+ for (int j = 1; j < num_classes; ++j)
{
auto &cur_keep = keeps[j];
std::vector<int> new_keeps_j;
- for(auto &k : cur_keep)
+ for (auto &k : cur_keep)
{
- if(in_scores[j][k] >= image_thresh)
+ if (in_scores[j][k] >= image_thresh)
{
new_keeps_j.push_back(k);
}
@@ -293,40 +307,52 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
// Write results
int cur_out_idx = 0;
- for(int j = j_start; j < num_classes; ++j)
+ for (int j = j_start; j < num_classes; ++j)
{
- auto &cur_keep = keeps[j];
- auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
- auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
- const int box_column = (cur_start_idx + cur_out_idx) * 4;
-
- for(unsigned int k = 0; k < cur_keep.size(); ++k)
+ auto &cur_keep = keeps[j];
+ auto cur_out_scores =
+ reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ auto cur_out_classes =
+ reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+ const int box_column = (cur_start_idx + cur_out_idx) * 4;
+
+ for (unsigned int k = 0; k < cur_keep.size(); ++k)
{
- cur_out_scores[k] = in_scores[j][cur_keep[k]];
- cur_out_classes[k] = static_cast<T>(j);
- auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
- auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
- auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
- auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
- *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
- *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
- *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
- *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+ cur_out_scores[k] = in_scores[j][cur_keep[k]];
+ cur_out_classes[k] = static_cast<T>(j);
+ auto cur_out_box_row0 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+ auto cur_out_box_row1 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+ auto cur_out_box_row2 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+ auto cur_out_box_row3 =
+ reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+ *cur_out_box_row0 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+ *cur_out_box_row1 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+ *cur_out_box_row2 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+ *cur_out_box_row3 =
+ *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
}
cur_out_idx += cur_keep.size();
}
- if(_keeps != nullptr)
+ if (_keeps != nullptr)
{
cur_out_idx = 0;
- for(int j = 0; j < num_classes; ++j)
+ for (int j = 0; j < num_classes; ++j)
{
- for(unsigned int i = 0; i < keeps[j].size(); ++i)
+ for (unsigned int i = 0; i < keeps[j].size(); ++i)
{
- *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+ *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) =
+ static_cast<T>(keeps[j].at(i));
}
- *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+ *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) =
+ keeps[j].size();
cur_out_idx += keeps[j].size();
}
}
@@ -334,17 +360,25 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
cur_start_idx += total_keep_count;
}
- if(_batch_splits_out != nullptr)
+ if (_batch_splits_out != nullptr)
{
- for(int b = 0; b < batch_size; ++b)
+ for (int b = 0; b < batch_size; ++b)
{
*reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
}
}
}
-void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out,
+ ITensor *keeps,
+ ITensor *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
@@ -352,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_
const unsigned int num_classes = scores_in->info()->dimension(0);
ARM_COMPUTE_UNUSED(num_classes);
- ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
- ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+ ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0),
+ "First dimension of input boxes must be of size 4*num_classes");
+ ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1),
+ "Input scores and input boxes must have the same number of rows");
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0));
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
- ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+ ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr,
+ "keeps_size cannot be nullptr if keeps has to be provided as output");
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
}
- if(batch_splits_in != nullptr)
+ if (batch_splits_in != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
}
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
}
@@ -399,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
- switch(_scores_in->info()->data_type())
+ switch (_scores_in->info()->data_type())
{
case DataType::F32:
run_nmslimit<float>();
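Aside (illustrative, not part of the patch): SoftNMS() above decays the scores of boxes that overlap the current pick instead of discarding them; the LINEAR branch shown uses weight = 1 - IoU once the overlap exceeds the NMS threshold, and boxes whose decayed score drops below the minimum leave the pending set. A self-contained sketch of that update with hypothetical scores and overlaps (the GAUSSIAN branch is elided in the hunk and omitted here as well):

#include <cstddef>
#include <iostream>
#include <vector>

// Linear soft-NMS weight, as in the NMSType::LINEAR case above.
float linear_weight(float iou, float nms_threshold)
{
    return (iou > nms_threshold) ? (1.0f - iou) : 1.0f;
}

int main()
{
    std::vector<float>       scores        = {0.90f, 0.80f, 0.75f}; // index 0 is the box just kept
    const std::vector<float> iou_with_best = {0.00f, 0.60f, 0.20f}; // overlap with the kept box
    const float nms_threshold = 0.5f;
    const float min_score     = 0.3f;

    std::vector<std::size_t> still_pending;
    for (std::size_t i = 1; i < scores.size(); ++i)
    {
        scores[i] *= linear_weight(iou_with_best[i], nms_threshold);
        if (scores[i] >= min_score)
            still_pending.push_back(i); // survives into the next selection round
    }

    for (std::size_t idx : still_pending)
        std::cout << idx << ": " << scores[idx] << '\n'; // box 1 decays to ~0.32, box 2 keeps 0.75
    return 0;
}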
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index c1187ff2b3..1224ec14a7 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -35,15 +35,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold)
+Status validate_arguments(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2,
+ "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1,
+ "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1,
+ "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
@@ -55,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores,
} // namespace
CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
- : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0)
+ : _input_bboxes(nullptr),
+ _input_scores(nullptr),
+ _output_indices(nullptr),
+ _max_output_size(0),
+ _score_threshold(0.f),
+ _iou_threshold(0.f),
+ _num_boxes(0)
{
}
-void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices,
- unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes,
+ const ITensor *input_scores,
+ ITensor *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(),
+ max_output_size, score_threshold, iou_threshold));
auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo());
@@ -82,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons
ICPPKernel::configure(win);
}
-Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices,
- unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
return Status{};
}
@@ -99,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
// Auxiliary tensors
std::vector<int> indices_above_thd;
std::vector<float> scores_above_thd;
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
- if(score_i >= _score_threshold)
+ if (score_i >= _score_threshold)
{
scores_above_thd.emplace_back(score_i);
indices_above_thd.emplace_back(i);
@@ -114,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
std::vector<unsigned int> sorted_indices;
sorted_indices.resize(num_above_thd);
std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0);
- std::sort(std::begin(sorted_indices),
- std::end(sorted_indices),
+ std::sort(std::begin(sorted_indices), std::end(sorted_indices),
[&](unsigned int first, unsigned int second)
- {
- return scores_above_thd[first] > scores_above_thd[second];
- });
+ { return scores_above_thd[first] > scores_above_thd[second]; });
// Number of output is the minimum between max_detection and the scores above the threshold
const unsigned int num_output = std::min(_max_output_size, num_above_thd);
@@ -127,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
std::vector<bool> visited(num_above_thd, false);
// Keep only boxes with small IoU
- for(unsigned int i = 0; i < num_above_thd; ++i)
+ for (unsigned int i = 0; i < num_above_thd; ++i)
{
// Check if the output is full
- if(output_idx >= num_output)
+ if (output_idx >= num_output)
{
break;
}
// Check if it was already visited, if not add it to the output and update the indices counter
- if(!visited[sorted_indices[i]])
+ if (!visited[sorted_indices[i]])
{
- *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]];
- visited[sorted_indices[i]] = true;
+ *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) =
+ indices_above_thd[sorted_indices[i]];
+ visited[sorted_indices[i]] = true;
++output_idx;
}
else
@@ -148,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// Once added one element at the output check if the next ones overlap and can be skipped
- for(unsigned int j = i + 1; j < num_above_thd; ++j)
+ for (unsigned int j = i + 1; j < num_above_thd; ++j)
{
- if(!visited[sorted_indices[j]])
+ if (!visited[sorted_indices[j]])
{
// Calculate IoU
const unsigned int i_index = indices_above_thd[sorted_indices[i]];
const unsigned int j_index = indices_above_thd[sorted_indices[j]];
// Box-corner format: xmin, ymin, xmax, ymax
- const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
- const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
- const auto box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
- const auto box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
-
- const auto box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
- const auto box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
- const auto box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
- const auto box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
+ const auto box_i_xmin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
+ const auto box_i_ymin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
+ const auto box_i_xmax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
+ const auto box_i_ymax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
+
+ const auto box_j_xmin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
+ const auto box_j_ymin =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
+ const auto box_j_xmax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
+ const auto box_j_ymax =
+ *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin);
const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin);
float overlap;
- if(area_i <= 0 || area_j <= 0)
+ if (area_i <= 0 || area_j <= 0)
{
overlap = 0.0f;
}
@@ -179,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin);
const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax);
const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax);
- const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
- overlap = area_intersection / (area_i + area_j - area_intersection);
+ const auto area_intersection = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) *
+ std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
+ overlap = area_intersection / (area_i + area_j - area_intersection);
}
- if(overlap > _iou_threshold)
+ if (overlap > _iou_threshold)
{
visited[sorted_indices[j]] = true;
}
@@ -192,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
}
// The output could be full but not the output indices tensor
// Instead return values not valid we put -1
- for(; output_idx < _max_output_size; ++output_idx)
+ for (; output_idx < _max_output_size; ++output_idx)
{
*(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1;
}
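Aside (illustrative, not part of the patch): the overlap computed in the run() hunk above is a plain intersection-over-union on corner-format boxes, with degenerate (non-positive area) boxes treated as non-overlapping. The same arithmetic as a standalone function with a hypothetical pair of boxes:

#include <algorithm>
#include <cassert>

struct Box
{
    float xmin, ymin, xmax, ymax; // corner format, as in the kernel above
};

float iou(const Box &a, const Box &b)
{
    const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
    const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
    if (area_a <= 0.0f || area_b <= 0.0f)
        return 0.0f; // degenerate boxes never suppress anything

    const float ix    = std::max(0.0f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
    const float iy    = std::max(0.0f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
    const float inter = ix * iy;
    return inter / (area_a + area_b - inter);
}

int main()
{
    const Box a{0.0f, 0.0f, 2.0f, 2.0f};
    const Box b{1.0f, 1.0f, 3.0f, 3.0f};
    const float v = iou(a, b); // intersection 1, union 7 -> ~0.143
    assert(v > 0.14f && v < 0.15f);
    return 0;
}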
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 054c7bf05a..e68090d82b 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -65,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window)
// Create output window
Window window_out(window);
const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+ for (size_t d = 0; d <= _perm.num_dimensions(); ++d)
{
window_out.set(d, zero_window);
}
@@ -74,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window)
Iterator in(_input, window);
Iterator out(_output, window_out);
- if(_input->info()->num_dimensions() <= 3)
+ if (_input->info()->num_dimensions() <= 3)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
- else if(_input->info()->num_dimensions() >= 4)
+ else if (_input->info()->num_dimensions() >= 4)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] +
+ id[3] * perm_strides[3];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
}
}
-CPPPermuteKernel::CPPPermuteKernel()
- : _func(), _input(nullptr), _output(nullptr), _perm()
+CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm()
{
}
@@ -113,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe
_output = output;
_perm = perm;
- switch(input->info()->element_size())
+ switch (input->info()->element_size())
{
case 1:
_func = &CPPPermuteKernel::run_permute<uint8_t>;
@@ -152,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
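Aside (illustrative, not part of the patch): run_permute() above turns each copy into a single assignment by precomputing perm_strides, so the destination offset is just the dot product of the source coordinate with those strides. A small standalone model of that offset computation; the shape, the permutation convention (source dimension d lands at position perm[d] in the destination) and the element size of 1 are assumptions made for the example:

#include <array>
#include <cassert>
#include <cstddef>

int main()
{
    const std::array<std::size_t, 3> src_shape = {4, 3, 2}; // hypothetical 3-D tensor
    const std::array<std::size_t, 3> perm      = {2, 0, 1};

    // Destination shape and its strides (dimension 0 contiguous, element size 1).
    std::array<std::size_t, 3> dst_shape{};
    for (std::size_t d = 0; d < 3; ++d)
        dst_shape[perm[d]] = src_shape[d];
    const std::array<std::size_t, 3> dst_strides = {1, dst_shape[0], dst_shape[0] * dst_shape[1]};

    // perm_strides[d] = stride of the destination dimension that source dimension d maps to.
    std::array<std::size_t, 3> perm_strides{};
    for (std::size_t d = 0; d < 3; ++d)
        perm_strides[d] = dst_strides[perm[d]];

    // Destination offset of source coordinate (x, y, z), as in the lambda above.
    const std::size_t x = 1, y = 2, z = 1;
    const std::size_t idx = x * perm_strides[0] + y * perm_strides[1] + z * perm_strides[2];
    assert(idx == 11 && idx < dst_shape[0] * dst_shape[1] * dst_shape[2]);
    return 0;
}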
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index d2b54e412e..6ffb68e770 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -34,32 +34,34 @@ namespace arm_compute
{
namespace
{
-template <typename T,
- typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+template <typename T, typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
inline bool greater_than(T a, T b)
{
const T epsilon = std::numeric_limits<T>::epsilon();
return (a - b > epsilon);
}
-template < typename T,
- typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+template <typename T, typename std::enable_if<!utils::traits::is_floating_point<T>::value, int>::type = 0>
inline bool greater_than(T a, T b)
{
return (a > b);
}
-Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status validate_arguments(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
ARM_COMPUTE_UNUSED(k);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -72,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar
template <typename T>
void CPPTopKVKernel::run_topkv()
{
- for(unsigned int i = 0; i < _batch_size; ++i)
+ for (unsigned int i = 0; i < _batch_size; ++i)
{
- const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
- const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+ const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{i}));
+ const auto predicted_value =
+ *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{target_class_id, i}));
// The variable rank indicates how many values there are before the target_class_id
unsigned int rank = 0;
- for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+ for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
{
- const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
- if(greater_than(current_prediction, predicted_value))
+ const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{j, i}));
+ if (greater_than(current_prediction, predicted_value))
{
rank++;
}
}
- *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+ *(_output->ptr_to_element(Coordinates{i})) = static_cast<uint8_t>(rank < _k);
}
}
@@ -96,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel()
{
}
-void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+void CPPTopKVKernel::configure(const ITensor *predictions,
+ const ITensor *targets,
+ ITensor *output,
+ const unsigned int k)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
@@ -115,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target
ICPPKernel::configure(Window()); // Default 1 iteration window
}
-Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
return Status{};
@@ -129,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const
void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(window, info);
- switch(_predictions->info()->data_type())
+ switch (_predictions->info()->data_type())
{
case DataType::F32:
run_topkv<float>();
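
For reference, the rank-based validity check that run_topkv() performs per batch item reduces to the scalar sketch below (a hypothetical free function for illustration; a plain `>` stands in for the epsilon-based greater_than() used for floating-point types above):

#include <cstdint>
#include <vector>

// Returns true when fewer than k classes score strictly higher than the target class,
// i.e. the target class is within the top-k predictions for this sample.
inline bool is_in_top_k(const std::vector<float> &predictions, uint32_t target_class_id, unsigned int k)
{
    const float  predicted_value = predictions[target_class_id];
    unsigned int rank            = 0;
    for (std::size_t j = 0; j < predictions.size() && rank < k; ++j)
    {
        if (predictions[j] > predicted_value)
        {
            ++rank;
        }
    }
    return rank < k;
}
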
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index 7ef83fb2c4..b1efe32446 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
#include "arm_compute/core/Helpers.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include <cstddef>
@@ -31,8 +32,7 @@
namespace arm_compute
{
-CPPUpsampleKernel::CPPUpsampleKernel()
- : _input(nullptr), _output(nullptr), _info()
+CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info()
{
}
@@ -82,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
const size_t element_size = _input->info()->element_size();
// The fill value is normally 0, but for quantized types '0' corresponds to the offset
- switch(_output->info()->data_type())
+ switch (_output->info()->data_type())
{
case DataType::QASYMM8:
{
@@ -102,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
// Create window
Window window_out(window);
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width));
window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height));
@@ -117,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, window);
Iterator out(_output, window_out);
- execute_window_loop(window, [&](const Coordinates &)
- {
- memcpy(out.ptr(), in.ptr(), element_size);
- },
- in, out);
+ execute_window_loop(
+ window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 5c8d45c987..679a93f9af 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -36,9 +36,10 @@ Status arm_compute::create_error(ErrorCode error_code, std::string msg)
return Status(error_code, msg);
}
-Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
+Status
+arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
{
- std::array<char, 512> out{ 0 };
+ std::array<char, 512> out{0};
snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg);
return Status(error_code, std::string(out.data()));
}
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index 292acf8633..2d1a13cb33 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/GPUTarget.h"
+
#include "arm_compute/core/Log.h"
#include <map>
@@ -31,47 +32,47 @@ namespace
{
arm_compute::GPUTarget get_valhall_target(const std::string &version)
{
- if(version.find("G77") != std::string::npos)
+ if (version.find("G77") != std::string::npos)
{
return arm_compute::GPUTarget::G77;
}
- else if(version.find("G57") != std::string::npos)
+ else if (version.find("G57") != std::string::npos)
{
return arm_compute::GPUTarget::G57;
}
- if(version.find("G68") != std::string::npos)
+ if (version.find("G68") != std::string::npos)
{
return arm_compute::GPUTarget::G68;
}
- if(version.find("G78AE") != std::string::npos)
+ if (version.find("G78AE") != std::string::npos)
{
return arm_compute::GPUTarget::G78AE;
}
- if(version.find("G78") != std::string::npos)
+ if (version.find("G78") != std::string::npos)
{
return arm_compute::GPUTarget::G78;
}
- else if(version.find("G710") != std::string::npos)
+ else if (version.find("G710") != std::string::npos)
{
return arm_compute::GPUTarget::G710;
}
- else if(version.find("G610") != std::string::npos)
+ else if (version.find("G610") != std::string::npos)
{
return arm_compute::GPUTarget::G610;
}
- else if(version.find("G510") != std::string::npos)
+ else if (version.find("G510") != std::string::npos)
{
return arm_compute::GPUTarget::G510;
}
- else if(version.find("G310") != std::string::npos)
+ else if (version.find("G310") != std::string::npos)
{
return arm_compute::GPUTarget::G310;
}
- else if(version.find("G715") != std::string::npos)
+ else if (version.find("G715") != std::string::npos)
{
return arm_compute::GPUTarget::G715;
}
- else if(version.find("G615") != std::string::npos)
+ else if (version.find("G615") != std::string::npos)
{
return arm_compute::GPUTarget::G615;
}
@@ -83,39 +84,39 @@ arm_compute::GPUTarget get_valhall_target(const std::string &version)
arm_compute::GPUTarget get_bifrost_target(const std::string &version)
{
- if(version.find("G71") != std::string::npos)
+ if (version.find("G71") != std::string::npos)
{
return arm_compute::GPUTarget::G71;
}
- else if(version.find("G72") != std::string::npos)
+ else if (version.find("G72") != std::string::npos)
{
return arm_compute::GPUTarget::G72;
}
- else if(version.find("G51BIG") != std::string::npos)
+ else if (version.find("G51BIG") != std::string::npos)
{
return arm_compute::GPUTarget::G51BIG;
}
- else if(version.find("G51LIT") != std::string::npos)
+ else if (version.find("G51LIT") != std::string::npos)
{
return arm_compute::GPUTarget::G51LIT;
}
- else if(version.find("G51") != std::string::npos)
+ else if (version.find("G51") != std::string::npos)
{
return arm_compute::GPUTarget::G51;
}
- else if(version.find("G52LIT") != std::string::npos)
+ else if (version.find("G52LIT") != std::string::npos)
{
return arm_compute::GPUTarget::G52LIT;
}
- else if(version.find("G52") != std::string::npos)
+ else if (version.find("G52") != std::string::npos)
{
return arm_compute::GPUTarget::G52;
}
- else if(version.find("G76") != std::string::npos)
+ else if (version.find("G76") != std::string::npos)
{
return arm_compute::GPUTarget::G76;
}
- else if(version.find("G31") != std::string::npos)
+ else if (version.find("G31") != std::string::npos)
{
return arm_compute::GPUTarget::G31;
}
@@ -127,15 +128,15 @@ arm_compute::GPUTarget get_bifrost_target(const std::string &version)
arm_compute::GPUTarget get_midgard_target(const std::string &version)
{
- if(version.find("T600") != std::string::npos)
+ if (version.find("T600") != std::string::npos)
{
return arm_compute::GPUTarget::T600;
}
- else if(version.find("T700") != std::string::npos)
+ else if (version.find("T700") != std::string::npos)
{
return arm_compute::GPUTarget::T700;
}
- else if(version.find("T800") != std::string::npos)
+ else if (version.find("T800") != std::string::npos)
{
return arm_compute::GPUTarget::T800;
}
@@ -150,34 +151,16 @@ namespace arm_compute
{
const std::string &string_from_target(GPUTarget target)
{
- static std::map<GPUTarget, const std::string> gpu_target_map =
- {
- { GPUTarget::MIDGARD, "midgard" },
- { GPUTarget::BIFROST, "bifrost" },
- { GPUTarget::VALHALL, "valhall" },
- { GPUTarget::T600, "t600" },
- { GPUTarget::T700, "t700" },
- { GPUTarget::T800, "t800" },
- { GPUTarget::G71, "g71" },
- { GPUTarget::G72, "g72" },
- { GPUTarget::G51, "g51" },
- { GPUTarget::G51BIG, "g51big" },
- { GPUTarget::G51LIT, "g51lit" },
- { GPUTarget::G31, "g31" },
- { GPUTarget::G76, "g76" },
- { GPUTarget::G52, "g52" },
- { GPUTarget::G52LIT, "g52lit" },
- { GPUTarget::G77, "g77" },
- { GPUTarget::G57, "g57" },
- { GPUTarget::G78, "g78" },
- { GPUTarget::G68, "g68" },
- { GPUTarget::G78AE, "g78ae" },
- { GPUTarget::G710, "g710" },
- { GPUTarget::G610, "g610" },
- { GPUTarget::G510, "g510" },
- { GPUTarget::G310, "g310" },
- { GPUTarget::G715, "g715" },
- { GPUTarget::G615, "g615" },
+ static std::map<GPUTarget, const std::string> gpu_target_map = {
+ {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"},
+ {GPUTarget::T600, "t600"}, {GPUTarget::T700, "t700"}, {GPUTarget::T800, "t800"},
+ {GPUTarget::G71, "g71"}, {GPUTarget::G72, "g72"}, {GPUTarget::G51, "g51"},
+ {GPUTarget::G51BIG, "g51big"}, {GPUTarget::G51LIT, "g51lit"}, {GPUTarget::G31, "g31"},
+ {GPUTarget::G76, "g76"}, {GPUTarget::G52, "g52"}, {GPUTarget::G52LIT, "g52lit"},
+ {GPUTarget::G77, "g77"}, {GPUTarget::G57, "g57"}, {GPUTarget::G78, "g78"},
+ {GPUTarget::G68, "g68"}, {GPUTarget::G78AE, "g78ae"}, {GPUTarget::G710, "g710"},
+ {GPUTarget::G610, "g610"}, {GPUTarget::G510, "g510"}, {GPUTarget::G310, "g310"},
+ {GPUTarget::G715, "g715"}, {GPUTarget::G615, "g615"},
};
return gpu_target_map[target];
@@ -189,7 +172,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
std::smatch name_parts;
const bool found_mali = std::regex_search(device_name, name_parts, mali_regex);
- if(!found_mali)
+ if (!found_mali)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. Target is set to default.");
return GPUTarget::MIDGARD;
@@ -203,22 +186,22 @@ GPUTarget get_target_from_name(const std::string &device_name)
// Work-out gpu target
GPUTarget gpu_target;
- if(target == 'G' || is_future_gpu)
+ if (target == 'G' || is_future_gpu)
{
// Check for Valhall or Bifrost
gpu_target = get_valhall_target(version);
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
gpu_target = get_bifrost_target(version);
}
// Default GPUTarget
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
gpu_target = GPUTarget::VALHALL;
}
}
- else if(target == 'T')
+ else if (target == 'T')
{
gpu_target = get_midgard_target(version);
}
@@ -228,7 +211,7 @@ GPUTarget get_target_from_name(const std::string &device_name)
}
// Report in case of unknown target
- if(gpu_target == GPUTarget::UNKNOWN)
+ if (gpu_target == GPUTarget::UNKNOWN)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ Mali GPU unknown. Target is set to the default one. (BIFROST)");
return GPUTarget::BIFROST;
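
A detail worth noting in get_valhall_target()/get_bifrost_target() above: the substring checks must test the more specific product names first, because "G78AE" contains "G78" and "G51BIG"/"G51LIT" contain "G51". A minimal sketch of that ordering (an illustrative helper, not library code):

#include <string>

inline std::string match_mali_version(const std::string &version)
{
    if (version.find("G78AE") != std::string::npos)
    {
        return "G78AE"; // must be tested before the plain "G78" check
    }
    if (version.find("G78") != std::string::npos)
    {
        return "G78";
    }
    return "UNKNOWN";
}

// e.g. match_mali_version("Mali-G78AE r0p1") yields "G78AE", not "G78".
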
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 28e7f4c1e5..c801b097b5 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -25,8 +25,11 @@
namespace arm_compute
{
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
- InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info,
+ const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy,
+ SamplingPolicy sampling_policy,
+ bool border_undefined)
{
const DataLayout data_layout = src_info.data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -49,9 +52,9 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
auto valid_end_out_y = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]);
// Handle valid points in case of the bi-linear interpolation
- if(border_undefined)
+ if (border_undefined)
{
- switch(interpolate_policy)
+ switch (interpolate_policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -90,7 +93,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens
}
// Setup output valid region
- ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() };
+ ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()};
valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x));
valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y));
@@ -109,14 +112,12 @@ const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map()
constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH;
constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES;
- static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map =
- {
- { DataLayout::NDHWC, { C, W, H, D, N } },
- { DataLayout::NCDHW, { W, H, D, C, N } },
- { DataLayout::NHWC, { C, W, H, N } },
- { DataLayout::NCHW, { W, H, C, N } }
- };
+ static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map = {
+ {DataLayout::NDHWC, {C, W, H, D, N}},
+ {DataLayout::NCDHW, {W, H, D, C, N}},
+ {DataLayout::NHWC, {C, W, H, N}},
+ {DataLayout::NCHW, {W, H, C, N}}};
return layout_map;
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
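
The layout map above pairs each DataLayout with the ordering of its logical dimensions, which is what lets a caller ask "at which index does WIDTH live in NHWC?". A minimal sketch of that lookup, assuming the index is simply the position within the per-layout vector (simplified stand-in types, not the library's):

#include <algorithm>
#include <vector>

enum class Dim { C, W, H, N };

inline int dimension_index(const std::vector<Dim> &layout, Dim d)
{
    const auto it = std::find(layout.begin(), layout.end(), d);
    return static_cast<int>(it - layout.begin());
}

// e.g. with NHWC expressed as {C, W, H, N}, dimension_index(layout, Dim::W) == 1.
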
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 832801255f..923c5f8a85 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -29,14 +29,18 @@
using namespace arm_compute;
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+ const ValidRegion &input_valid_region) const
{
return compute_valid_region(window, input_valid_region, false, BorderSize(0));
}
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const
{
- if(_info == nullptr)
+ if (_info == nullptr)
{
return input_valid_region;
}
@@ -45,7 +49,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
Coordinates old_anchor(anchor);
TensorShape &shape = input_valid_region.shape;
- if(!border_undefined)
+ if (!border_undefined)
{
border_size = BorderSize(0);
}
@@ -56,7 +60,7 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
// Additionally the valid region is shifted by the offset that is used by
// the kernel to write back output values.
anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
- if(_info->num_dimensions() > 1)
+ if (_info->num_dimensions() > 1)
{
anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
}
@@ -69,15 +73,19 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
// old size is first converted into end points to compared against the
// execution window. Afterwards the new end points are converted back into
// a size of the region.
- shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
- if(_info->num_dimensions() > 1)
+ shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right,
+ (window.x().end() - window.x().step()) * _scale_x + _width) -
+ anchor[0]);
+ if (_info->num_dimensions() > 1)
{
- shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom,
+ (window.y().end() - window.y().step()) * _scale_y + _height) -
+ anchor[1]);
}
// For higher dimensions use the intersection of the window size and the
// valid region of the input
- for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ for (size_t d = 2; d < _info->num_dimensions(); ++d)
{
anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -86,9 +94,12 @@ ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, Va
return input_valid_region;
}
-void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size)
+void AccessWindowRectangle::set_valid_region(const Window &window,
+ const ValidRegion &input_valid_region,
+ bool border_undefined,
+ const BorderSize &border_size)
{
- if(_info != nullptr)
+ if (_info != nullptr)
{
_info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size));
}
@@ -97,17 +108,16 @@ void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRe
bool AccessWindowRectangle::update_window_if_needed(Window &window) const
{
// Only update the window size if we can't use padding
- if(_info == nullptr || _info->is_resizable())
+ if (_info == nullptr || _info->is_resizable())
{
return false;
}
- PaddingSize needed = get_needed_padding(window);
+ PaddingSize needed = get_needed_padding(window);
PaddingSize available = _info->padding();
- if(needed.top <= available.top && needed.right <= available.right
- && needed.bottom <= available.bottom
- && needed.left <= available.left)
+ if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom &&
+ needed.left <= available.left)
{
return false;
}
@@ -124,12 +134,12 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
// Adjust window start for Y dimension
- if(min_y < 0)
+ if (min_y < 0)
{
// Calculate rows available above the tensor
const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
- if(min_y < front_pad_y_available)
+ if (min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
@@ -144,18 +154,19 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
}
// Adjust window end for Y dimension
- if(max_y > static_cast<int>(shape[1]))
+ if (max_y > static_cast<int>(shape[1]))
{
const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
// Calculate rows available below the tensor
const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
- if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
- end = std::max<int>(window.y().start(), end / _scale_y);
+ int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) +
+ window.y().step() * _scale_y - _y - _height;
+ end = std::max<int>(window.y().start(), end / _scale_y);
window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
window_modified = true;
@@ -170,11 +181,14 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
// Adjust window start for X dimension
- if(min_x < 0)
+ if (min_x < 0)
{
- const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+ const int front_pad_x_available =
+ -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+ stride_y - shape[0] * strides[0]) /
+ static_cast<int>(strides[0]);
- if(min_x < front_pad_x_available)
+ if (min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
@@ -189,15 +203,16 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
}
// Adjust window end for X dimension
- if(max_x > static_cast<int>(shape[0]))
+ if (max_x > static_cast<int>(shape[0]))
{
const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
- if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
- end = std::max<int>(window.x().start(), end / _scale_x);
+ int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) +
+ window.x().step() * _scale_x - _x - _width;
+ end = std::max<int>(window.x().start(), end / _scale_x);
window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
window_modified = true;
@@ -212,15 +227,15 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
bool AccessWindowRectangle::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
- if(_info == nullptr || !_info->is_resizable())
+ if (_info == nullptr || !_info->is_resizable())
{
return false;
}
// Update strides in tensor info
- return _info->extend_padding( get_needed_padding(window));
+ return _info->extend_padding(get_needed_padding(window));
}
-PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const
+PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const
{
ARM_COMPUTE_ERROR_ON(_scale_x == 0);
ARM_COMPUTE_ERROR_ON(_scale_y == 0);
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
index 31f1ec7a3f..fb7e095091 100644
--- a/src/core/IKernel.cpp
+++ b/src/core/IKernel.cpp
@@ -30,8 +30,7 @@ const Window &IKernel::window() const
return _window;
}
-IKernel::IKernel()
- : _window()
+IKernel::IKernel() : _window()
{
// Create an empty window to make sure the children classes set the window values themselves
_window.set(Window::DimX, Window::Dimension(0, 0, 1));
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 2f4354cc6f..4dc8ea959b 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -35,7 +35,7 @@ namespace arm_compute
{
void ITensor::copy_from(const ITensor &src)
{
- if(&src == this)
+ if (&src == this)
{
return;
}
@@ -47,7 +47,7 @@ void ITensor::copy_from(const ITensor &src)
ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size());
- for(size_t d = 0; d < src_info->num_dimensions(); d++)
+ for (size_t d = 0; d < src_info->num_dimensions(); d++)
{
ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d));
}
@@ -66,11 +66,7 @@ void ITensor::copy_from(const ITensor &src)
const size_t line_size = src_info->element_size() * src_info->dimension(0);
execute_window_loop(
- win_src, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), line_size);
- },
- src_it, dst_it);
+ win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it);
}
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
@@ -87,10 +83,10 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
stream_status.copyfmt(s);
// Set precision
- if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+ if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
{
int precision = io_fmt.precision;
- if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+ if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
{
precision = std::numeric_limits<float>().max_digits10;
}
@@ -101,7 +97,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
size_t print_width = 0;
size_t print_height = 0;
int start_offset = 0;
- switch(io_fmt.print_region)
+ switch (io_fmt.print_region)
{
case IOFormatInfo::PrintRegion::NoPadding:
print_width = this->info()->dimension(0);
@@ -111,13 +107,14 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
case IOFormatInfo::PrintRegion::ValidRegion:
print_width = this->info()->valid_region().shape.x();
print_height = this->info()->valid_region().shape.y();
- start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(),
- this->info()->valid_region().anchor.y()));
+ start_offset = this->info()->offset_element_in_bytes(
+ Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y()));
break;
case IOFormatInfo::PrintRegion::Full:
print_width = padding.left + this->info()->dimension(0) + padding.right;
print_height = padding.top + this->info()->dimension(1) + padding.bottom;
- start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+ start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] -
+ padding.left * strides[0];
break;
default:
break;
@@ -129,16 +126,17 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
const uint8_t *ptr = this->buffer() + start_offset;
// Start printing
- for(size_t i = 0; i < slices2D; ++i)
+ for (size_t i = 0; i < slices2D; ++i)
{
// Find max_width of elements in slice to align columns
int max_element_width = 0;
- if(io_fmt.align_columns)
+ if (io_fmt.align_columns)
{
size_t offset = i * strides[2];
- for(size_t h = 0; h < print_height; ++h)
+ for (size_t h = 0; h < print_height; ++h)
{
- max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
+ max_element_width = std::max<int>(
+ max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
offset += strides[1];
}
}
@@ -146,7 +144,7 @@ void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
// Print slice
{
size_t offset = i * strides[2];
- for(size_t h = 0; h < print_height; ++h)
+ for (size_t h = 0; h < print_height; ++h)
{
print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
offset += strides[1];
diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp
index 90f9a45039..0f8b0824f8 100644
--- a/src/core/ITensorPack.cpp
+++ b/src/core/ITensorPack.cpp
@@ -27,10 +27,9 @@
namespace arm_compute
{
-ITensorPack::ITensorPack(std::initializer_list<PackElement> l)
- : _pack()
+ITensorPack::ITensorPack(std::initializer_list<PackElement> l) : _pack()
{
- for(auto &e : l)
+ for (auto &e : l)
{
_pack[e.id] = e;
}
@@ -54,7 +53,7 @@ void ITensorPack::add_const_tensor(int id, const ITensor *tensor)
const ITensor *ITensorPack::get_const_tensor(int id) const
{
auto it = _pack.find(id);
- if(it != _pack.end())
+ if (it != _pack.end())
{
return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
}
@@ -81,4 +80,4 @@ bool ITensorPack::empty() const
{
return _pack.empty();
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
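
The get_const_tensor() logic shown above (prefer the const pointer when both const and non-const entries exist for an id) reduces to the following standalone sketch with simplified stand-in types:

#include <map>

struct Tensor
{
};

struct PackElement
{
    int           id      = -1;
    Tensor       *tensor  = nullptr;
    const Tensor *ctensor = nullptr;
};

inline const Tensor *get_const_tensor(const std::map<int, PackElement> &pack, int id)
{
    const auto it = pack.find(id);
    if (it == pack.end())
    {
        return nullptr; // assumed fallback; the tail of the original function is not shown in this hunk
    }
    return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
}
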
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index e6d0e532c8..5f4d08d0f6 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -26,6 +26,7 @@
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -90,7 +91,7 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -130,18 +131,13 @@ inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to U8
uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = vmaxq_u8(out_u8, min_u8);
out_u8 = vminq_u8(out_u8, max_u8);
@@ -170,7 +166,7 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
int8x16_t max_s8,
bool is_bounded_relu)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -204,18 +200,13 @@ inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -247,8 +238,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
const static int32x4_t one_s32 = vdupq_n_s32(1);
// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- int32x4x4_t res_shift_gt0 =
- {
+ int32x4x4_t res_shift_gt0 = {
vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
@@ -260,8 +250,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
- int32x4x4_t res_shift_lt0 =
- {
+ int32x4x4_t res_shift_lt0 = {
vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
@@ -273,8 +262,7 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
// Select result depending on shift value
- const uint32x4x4_t mask_lt0 =
- {
+ const uint32x4x4_t mask_lt0 = {
#ifdef __aarch64__
vcltzq_s32(result_shift.val[0]),
vcltzq_s32(result_shift.val[1]),
@@ -300,18 +288,13 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -332,15 +315,20 @@ inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
*
* @return Quantized value
*/
-inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
+inline uint8_t finalize_quantization(int32_t in_value,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32_t result_offset_after_shift_s32,
+ uint8_t min_u8,
+ uint8_t max_u8,
+ bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
- if(result_shift < 0)
+ if (result_shift < 0)
{
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+ in_value = vgetq_lane_s32(
+ vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
}
else
{
@@ -355,7 +343,7 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
// Bound the result
uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
}
@@ -375,15 +363,20 @@ inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mul
*
* @return Quantized value
*/
-inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
+inline int8_t finalize_quantization(int32_t in_value,
+ int result_fixedpoint_multiplier,
+ int32_t result_shift,
+ int32_t result_offset_after_shift_s32,
+ int8_t min_s8,
+ int8_t max_s8,
+ bool is_bounded_relu)
{
int32x4_t in_s32 = vdupq_n_s32(in_value);
- if(result_shift < 0)
+ if (result_shift < 0)
{
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+ in_value = vgetq_lane_s32(
+ vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
}
else
{
@@ -399,7 +392,7 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
// Bound the result
int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
}
@@ -416,17 +409,16 @@ inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_mult
*/
inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -439,17 +431,14 @@ inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationI
*/
inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -462,19 +451,24 @@ inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationIn
*/
inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -487,19 +481,16 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantization
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
+ const float scale = qi.scale;
+ const int offset = qi.offset;
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -513,17 +504,22 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationI
*/
inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+ vscale),
+ }};
return vdequantized_input;
}
@@ -537,17 +533,14 @@ inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offs
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+ }};
return vdequantized_input;
}
@@ -560,15 +553,12 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offse
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
{
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
- }
- };
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
+ }};
return vdequantized_input;
}
@@ -581,16 +571,13 @@ inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale
*/
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- }
- };
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+ }};
return vdequantized_input;
}
@@ -607,18 +594,15 @@ inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInf
const int offset = qi.offset;
const float32x4_t voffset = vdupq_n_f32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
- }
- };
+ }};
return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
@@ -635,18 +619,15 @@ inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizat
const int offset = qi.offset;
const float32x4_t voffset = vdupq_n_f32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
- }
- };
+ }};
return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
@@ -654,22 +635,19 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3
{
const int32x4_t voffset = vdupq_n_s32(offset);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
- vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#else //__aarch64__
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
- vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#endif //__aarch64__
- }
- };
+ }};
return rf;
}
@@ -715,7 +693,7 @@ inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQua
auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
- return { pa, pb };
+ return {pa, pb};
}
} // namespace arm_compute
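
The vdequantize()/vquantize() families above vectorize the usual affine mapping of asymmetric quantization: real = scale * (q - offset) and q = round(real / scale) + offset, saturated to the storage range. A scalar sketch of that mapping (hypothetical helpers; the clamping mirrors the vqmovun/vqmovn saturation steps in the intrinsics):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline float dequantize_u8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}

inline uint8_t quantize_u8(float v, float scale, int32_t offset)
{
    // std::lround rounds half away from zero; the aarch64 path above uses
    // round-to-nearest-even (vcvtnq), so results can differ at exact .5 values.
    const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}
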
diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl
index ca2aea1e18..fd62fd4654 100644
--- a/src/core/NEON/NEAsymm.inl
+++ b/src/core/NEON/NEAsymm.inl
@@ -51,14 +51,14 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
// Convert float32 vectors to uint32 vectors
#if __aarch64__
- if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+ if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
{
A_u32x4 = vcvtnq_u32_f32(A_f32x4);
B_u32x4 = vcvtnq_u32_f32(B_f32x4);
C_u32x4 = vcvtnq_u32_f32(C_f32x4);
D_u32x4 = vcvtnq_u32_f32(D_f32x4);
}
- else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+ else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
{
A_u32x4 = vcvtaq_u32_f32(A_f32x4);
B_u32x4 = vcvtaq_u32_f32(B_f32x4);
@@ -86,7 +86,7 @@ inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t v
return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
}
-template <RoundingPolicy round_policy>
+template <RoundingPolicy round_policy>
inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
{
// Convert uint8 vectors to int16 vectors
@@ -110,14 +110,14 @@ inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x
C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
#if __aarch64__
- if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+ if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
{
A_s32x4 = vcvtnq_s32_f32(A_f32x4);
B_s32x4 = vcvtnq_s32_f32(B_f32x4);
C_s32x4 = vcvtnq_s32_f32(C_f32x4);
D_s32x4 = vcvtnq_s32_f32(D_f32x4);
}
- else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+ else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
{
A_s32x4 = vcvtaq_s32_f32(A_f32x4);
B_s32x4 = vcvtaq_s32_f32(B_f32x4);
diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl
index 8bff9c4a8e..fb403b6d26 100644
--- a/src/core/NEON/NEFixedPoint.inl
+++ b/src/core/NEON/NEFixedPoint.inl
@@ -30,13 +30,7 @@ namespace arm_compute
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
- float32x4x2_t res =
- {
- {
- vmaxq_f32(a.val[0], b.val[0]),
- vmaxq_f32(a.val[1], b.val[1])
- }
- };
+ float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}};
return res;
}
#endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
index 1cbe669373..f875917988 100644
--- a/src/core/NEON/NEMath.inl
+++ b/src/core/NEON/NEMath.inl
@@ -29,19 +29,16 @@
namespace arm_compute
{
/** Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
- {
- vdupq_n_f32(-2.29561495781f),
- vdupq_n_f32(-2.47071170807f),
- vdupq_n_f32(-5.68692588806f),
- vdupq_n_f32(-0.165253549814f),
- vdupq_n_f32(5.17591238022f),
- vdupq_n_f32(0.844007015228f),
- vdupq_n_f32(4.58445882797f),
- vdupq_n_f32(0.0141278216615f),
- }
-};
+const std::array<float32x4_t, 8> log_tab = {{
+ vdupq_n_f32(-2.29561495781f),
+ vdupq_n_f32(-2.47071170807f),
+ vdupq_n_f32(-5.68692588806f),
+ vdupq_n_f32(-0.165253549814f),
+ vdupq_n_f32(5.17591238022f),
+ vdupq_n_f32(0.844007015228f),
+ vdupq_n_f32(4.58445882797f),
+ vdupq_n_f32(0.0141278216615f),
+}};
/** Sin polynomial coefficients */
constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
@@ -54,7 +51,7 @@ inline float32x4_t prefer_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
#if __ARM_FEATURE_FMA
return vfmaq_f32(a, b, c);
-#else // __ARM_FEATURE_FMA
+#else // __ARM_FEATURE_FMA
return vmlaq_f32(a, b, c);
#endif // __ARM_FEATURE_FMA
}
@@ -73,13 +70,14 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val)
{
#ifdef __aarch64__
return vrndnq_f32(val);
-#else // __aarch64__
+#else // __aarch64__
static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f);
static const int32x4_t CONST_1_INT = vdupq_n_s32(1);
const float32x4_t floor_val = vfloorq_f32(val);
const float32x4_t diff = vsubq_f32(val, floor_val);
- const float32x4_t fp32_upper_limit = vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
+ const float32x4_t fp32_upper_limit =
+ vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
/*
* 1. Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0).
@@ -95,12 +93,13 @@ inline float32x4_t vroundq_rte_f32(float32x4_t val)
* Threshold upper limit with format |S|E(8bits)| Fraction(23bits) | = (23 + 127) << 23 (assuming positive sign): Adding 127, because 127 represents the actual zero in this format.
*/
- float32x4_t rounded_val = vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
- vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
- vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))),
- floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+ float32x4_t rounded_val = vbslq_f32(
+ vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
+ vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
+ vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+ floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
- float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
+ float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
return result;
#endif // __aarch64__
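The selects above implement round-half-to-even for AArch32, where no vector round-to-nearest instruction is available. A minimal scalar sketch of the same decision, with a helper name of our own that is not part of the library:

#include <cmath>
#include <cstdint>

// Round-half-to-even for one float, mirroring the vector selects above.
// Values with |x| >= 2^23 (the 0x4B000000 threshold) carry no fractional
// bits in FP32 and are returned unchanged.
static float round_rte_scalar(float x)
{
    if (std::fabs(x) >= 8388608.0f) // 2^23
    {
        return x;
    }
    const float fl            = std::floor(x);
    const float diff          = x - fl;
    const bool  floor_is_even = (static_cast<int32_t>(fl) & 1) == 0;
    // Keep the floor below the midpoint, or exactly at it when the floor is even.
    return (diff < 0.5f || (diff == 0.5f && floor_is_even)) ? fl : fl + 1.0f;
}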
@@ -118,8 +117,8 @@ inline float32x2_t vinvsqrt_f32(float32x2_t x)
inline float32x4_t vinvsqrtq_f32(float32x4_t x)
{
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
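The two identical lines above are Newton-Raphson refinements of the hardware reciprocal square-root estimate; vrsqrtsq_f32(a, b) evaluates (3 - a*b) / 2. A scalar sketch under that reading, with an illustrative function name:

// Two Newton-Raphson steps for 1/sqrt(x); `estimate` stands in for the
// result of the vrsqrte estimate instruction.
static float invsqrt_refined(float x, float estimate)
{
    float y = estimate;
    for (int i = 0; i < 2; ++i)
    {
        // (3 - (x*y)*y) / 2 is what vrsqrtsq_f32(x*y, y) returns; multiplying
        // by y gives the classic update y' = y * (3 - x*y^2) / 2.
        y = y * (3.0f - (x * y) * y) * 0.5f;
    }
    return y;
}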
@@ -152,8 +151,7 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t
return res;
}
-static const uint32_t exp_f32_coeff[] =
-{
+static const uint32_t exp_f32_coeff[] = {
0x3f7ffff6, // x^1: 0x1.ffffecp-1f
0x3efffedb, // x^2: 0x1.fffdb6p-2f
0x3e2aaf33, // x^3: 0x1.555e66p-3f
@@ -169,10 +167,12 @@ inline float32x4_t vexpq_f32(float32x4_t x)
const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[3]));
const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[4]));
- const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
- const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
- const auto neg_ln2_hi = vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
- const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+ const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+ const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+ const auto neg_ln2_hi =
+ vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+ const auto neg_ln2_lo =
+ vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
const auto inf = vdupq_n_f32(std::numeric_limits<float>::infinity());
const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5)
@@ -224,9 +224,9 @@ inline float32x4_t vexpq_f32(float32x4_t x)
#ifdef __aarch64__
inline float32x4_t verfq_f32(float32x4_t x)
{
- static const float erffdata[4] = { 0.278393f, 0.230389f, 0.000972f, 0.078108f };
+ static const float erffdata[4] = {0.278393f, 0.230389f, 0.000972f, 0.078108f};
static const float32x4_t coeffdata = vld1q_f32(erffdata);
- static const float32x4_t onev{ vdupq_n_f32(1.0f) };
+ static const float32x4_t onev{vdupq_n_f32(1.0f)};
uint32x4_t selector = vcltzq_f32(x);
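The erffdata constants above match the classic Abramowitz and Stegun rational approximation 7.1.27, erf(x) ~= 1 - 1/(1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4)^4 for x >= 0, extended to negative inputs by odd symmetry (the selector handles the sign). A scalar sketch of that formula; the attribution is inferred from the coefficients, it is not stated in the source:

#include <cmath>

// erf via 1 - 1/(1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4)^4 for x >= 0,
// using the same four coefficients as erffdata; erf(-x) = -erf(x).
static float erf_approx(float x)
{
    const float a1 = 0.278393f, a2 = 0.230389f, a3 = 0.000972f, a4 = 0.078108f;
    const float t  = std::fabs(x);
    const float d  = 1.0f + t * (a1 + t * (a2 + t * (a3 + t * a4)));
    const float d2 = d * d;
    return std::copysign(1.0f - 1.0f / (d2 * d2), x);
}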
@@ -287,10 +287,12 @@ inline float32x4_t vtanhq_f32(float32x4_t val)
float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
// x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise
- float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
- float32x4_t num = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
- float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
- float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
+ float32x4_t exp2x =
+ vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
+ float32x4_t num =
+ vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
+ float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
+ float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
return tanh;
}
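Written out per element, the four selects above pick one of two branches depending on |x| against CONST_THR (the 5e-3 threshold mentioned in the comment). A scalar sketch, with the clamp to the vector code's min/max range omitted:

#include <cmath>

// tanh(x) ~= x * (1 - x^2/3) for tiny |x|, otherwise (e^(2x) - 1) / (e^(2x) + 1).
static float tanh_approx(float x)
{
    if (std::fabs(x) < 5e-3f)
    {
        return x * (1.0f - x * x / 3.0f);
    }
    const float e2x = std::exp(2.0f * x);
    return (e2x - 1.0f) / (e2x + 1.0f);
}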
@@ -456,30 +458,23 @@ inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
{
- out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
- out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
- out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+ out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+ out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+ out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
}
inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
{
- const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in.val[1])));
- const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in.val[3])));
- out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+ const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+ const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+ out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
}
inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
{
- const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
- vqmovn_s32(vcvtq_s32_f32(in.val[1])));
- const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
- vqmovn_s32(vcvtq_s32_f32(in.val[3])));
- out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
+ const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1])));
+ const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3])));
+ out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
}
template <>
@@ -552,8 +547,8 @@ inline float16x4_t vinvsqrt_f16(float16x4_t x)
inline float16x8_t vinvsqrtq_f16(float16x8_t x)
{
float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
return sqrt_reciprocal;
}
@@ -602,8 +597,8 @@ inline float16x4_t vtanh_rational_approx_f16(float16x4_t x16)
inline float16x8_t vtanhq_f16(float16x8_t x)
{
// Split into high/low and use rational approximation on both parts exactly
- const float16x8_t tanh = vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)),
- vtanh_rational_approx_f16(vget_high_f16(x)));
+ const float16x8_t tanh =
+ vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x)));
// tanh(x) == sign(x) to F16 precision for |x| >= 4.508, use sign after this
const float16x8_t ONE = vdupq_n_f16(1.0f);
diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h
index e6644577a1..ec246efc8c 100644
--- a/src/core/NEON/NESymm.h
+++ b/src/core/NEON/NESymm.h
@@ -25,7 +25,9 @@
#define ARM_COMPUTE_NESYMM_H
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -49,13 +51,10 @@ using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 1
* @return Quantized values
*/
template <bool is_bounded_relu>
-int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int16x8_t min_s16,
- int16x8_t max_s16)
+int16x8_t finalize_quantization_int16(
+ int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
@@ -76,7 +75,7 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
// Convert S32 to S16
int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s16 = vmaxq_s16(out_s16, min_s16);
out_s16 = vminq_s16(out_s16, max_s16);
@@ -98,13 +97,14 @@ int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
* @return Quantized values
*/
template <bool is_bounded_relu>
-inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int16_t min_s16, int16_t max_s16)
+inline int16_t finalize_quantization_int16(
+ int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16)
{
- if(result_shift < 0)
+ if (result_shift < 0)
{
- const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
- in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
+ const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) *
+ static_cast<int64_t>(result_fixedpoint_multiplier);
+ in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
}
else
{
@@ -117,7 +117,7 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi
// Bound the result
int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
}
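The negative-shift branch above is, per element, a left shift followed by a rounding high multiply: the 64-bit product is brought back to 32 bits with round-to-nearest on bit 31. A standalone sketch of that arithmetic, with saturation of the extreme corner case left out:

#include <cstdint>

// Scale up by 2^(-result_shift), apply the fixed-point multiplier, then take
// round(product / 2^31), matching the result_shift < 0 branch above.
static int32_t requantize_negative_shift(int32_t value, int32_t multiplier, int32_t result_shift)
{
    const int64_t scaled = static_cast<int64_t>(value) * (int64_t{1} << (-result_shift)) *
                           static_cast<int64_t>(multiplier);
    return static_cast<int32_t>((scaled + (int64_t{1} << 30)) >> 31);
}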
@@ -134,14 +134,9 @@ inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoi
*/
inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
- }
- };
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}};
return vdequantized_input;
}
@@ -156,18 +151,13 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
{
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x2_t rf =
- {
- {
+ const int32x4x2_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+ vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+ vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
#endif //__aarch64__
- }
- };
+ }};
return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
}
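Both helpers reduce to simple per-element formulas once the lane handling is stripped away: QSYMM16 dequantization multiplies by the scale, quantization divides by it, rounds and saturates to int16. A scalar sketch; note the vector path rounds to nearest even on AArch64 and truncates elsewhere, while std::lround here rounds halves away from zero:

#include <algorithm>
#include <cmath>
#include <cstdint>

static float dequantize_qsymm16_scalar(int16_t q, float scale)
{
    return static_cast<float>(q) * scale;
}

static int16_t quantize_qsymm16_scalar(float value, float scale)
{
    const int32_t q = static_cast<int32_t>(std::lround(value / scale));
    return static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, q)));
}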
@@ -180,17 +170,14 @@ inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
*/
inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
- }
- };
+ const float scale = qi.scale;
+ const float32x4_t vscale = vdupq_n_f32(scale);
+ const float32x4x4_t vdequantized_input = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+ }};
return vdequantized_input;
}
@@ -206,24 +193,20 @@ inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQua
const float scale = qi.scale;
ARM_COMPUTE_ERROR_ON(scale == 0.f);
const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+ vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+ vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#endif //__aarch64__
- }
- };
- const qsymm16x8x2_t res =
- {
+ }};
+ const qsymm16x8x2_t res = {
vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
};
diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h
index eea2627c62..a448cde475 100644
--- a/src/core/NEON/SVEAsymm.h
+++ b/src/core/NEON/SVEAsymm.h
@@ -26,6 +26,7 @@
#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -70,10 +71,18 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svuint8_t &qv, float scal
const auto voffset = svdup_n_s32(offset);
const auto vscale = svdup_n_f32(scale);
const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale));
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)),
+ vscale),
+ svmul_f32_z(pg,
+ svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)),
+ vscale));
return vdequantized_input;
}
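Per element, the widening and reinterpret chain above computes the usual asymmetric dequantization, (q - offset) * scale, on sixteen 8-bit lanes at once. The scalar equivalent:

#include <cstdint>

// One lane of svdequantize_z for QASYMM8: subtract the zero point, then scale.
static float dequantize_qasymm8_scalar(uint8_t q, float scale, int32_t offset)
{
    return static_cast<float>(static_cast<int32_t>(q) - offset) * scale;
}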
@@ -104,10 +113,10 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale
const auto voffset = svdup_n_s32(offset);
const auto vscale = svdup_n_f32(scale);
const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
return vdequantized_input;
}
@@ -135,11 +144,11 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const Unifo
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale)
{
- const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
+ const svfloat32x4_t vdequantized_input =
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
return vdequantized_input;
}
@@ -153,12 +162,12 @@ inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svflo
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale)
{
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
return vdequantized_input;
}
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index 5ada7ae0ff..6d69b330ba 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -28,6 +28,7 @@
#include "src/core/NEON/wrapper/intrinsics/svcvt.h"
#include "src/core/NEON/wrapper/intrinsics/svdup_n.h"
#include "src/core/NEON/wrapper/intrinsics/svreinterpret.h"
+
#include <arm_sve.h>
#include <array>
@@ -181,9 +182,12 @@ svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b);
* @return The converted integer vector
*/
template <typename int_vec_type>
-int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3);
+int_vec_type convert_float_to_int(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3);
} // namespace arm_compute
#include "src/core/NEON/SVEMath.inl"
#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_SVEMATH_H */
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index 8973d0b273..b30125dcb7 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -32,8 +32,16 @@
namespace arm_compute
{
-inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3,
- svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8)
+inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg,
+ svfloat32_t x,
+ svfloat32_t coeff_1,
+ svfloat32_t coeff_2,
+ svfloat32_t coeff_3,
+ svfloat32_t coeff_4,
+ svfloat32_t coeff_5,
+ svfloat32_t coeff_6,
+ svfloat32_t coeff_7,
+ svfloat32_t coeff_8)
{
const auto A = svmla_f32_z(pg, coeff_1, coeff_5, x);
const auto B = svmla_f32_z(pg, coeff_3, coeff_7, x);
@@ -45,8 +53,16 @@ inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t c
return res;
}
-inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3,
- svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8)
+inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg,
+ svfloat16_t x,
+ svfloat16_t coeff_1,
+ svfloat16_t coeff_2,
+ svfloat16_t coeff_3,
+ svfloat16_t coeff_4,
+ svfloat16_t coeff_5,
+ svfloat16_t coeff_6,
+ svfloat16_t coeff_7,
+ svfloat16_t coeff_8)
{
const auto A = svmla_f16_z(pg, coeff_1, coeff_5, x);
const auto B = svmla_f16_z(pg, coeff_3, coeff_7, x);
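The A and B terms shown above are the first half of an Estrin-style evaluation of a degree-7 polynomial: coefficients are paired with fused multiply-adds and combined through x^2 and x^4, which shortens the dependency chain compared with Horner's rule. A scalar sketch with the coefficients in plain ascending-power order (the library's tables are stored in whatever order its own pairing consumes, which may differ):

// p(x) = a[0] + a[1]*x + ... + a[7]*x^7, evaluated Estrin-style.
static float poly7_estrin(float x, const float a[8])
{
    const float x2 = x * x;
    const float x4 = x2 * x2;
    const float A  = a[0] + a[1] * x;
    const float B  = a[2] + a[3] * x;
    const float C  = a[4] + a[5] * x;
    const float D  = a[6] + a[7] * x;
    return (A + B * x2) + (C + D * x2) * x4;
}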
@@ -90,15 +106,17 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3]));
const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4]));
- const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
- const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
- const auto neg_ln2_hi = svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
- const auto neg_ln2_lo = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+ const auto shift = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+ const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+ const auto neg_ln2_hi =
+ svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
+ const auto neg_ln2_lo =
+ svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
const auto inf = svdup_n_f32(std::numeric_limits<float>::infinity());
- const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
+ const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
const auto zero = svdup_n_f32(0.f);
- const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
+ const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
// Range reduction:
// e^x = 2^n * e^r
@@ -114,23 +132,23 @@ inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
// (i.e. n) because the decimal part has been pushed out and lost.
// * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
// in FP32 format. Left shifting z by 23 bits will result in 2^n.
- const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
- const auto n = svsub_f32_z(pg, z, shift);
- const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
+ const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
+ const auto n = svsub_f32_z(pg, z, shift);
+ const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
// The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
// This outperforms longer Taylor series (3-4 tabs) both in term of accuracy and performance.
const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi);
- const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
+ const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
// Compute the truncated Taylor series of e^r.
// poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
const auto r2 = svmul_f32_z(pg, r, r);
- const auto p1 = svmul_f32_z(pg, c1, r);
- const auto p23 = svmla_f32_z(pg, c2, c3, r);
- const auto p45 = svmla_f32_z(pg, c4, c5, r);
- const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
+ const auto p1 = svmul_f32_z(pg, c1, r);
+ const auto p23 = svmla_f32_z(pg, c2, c3, r);
+ const auto p45 = svmla_f32_z(pg, c4, c5, r);
+ const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
const auto p12345 = svmla_f32_z(pg, p1, p2345, r2);
auto poly = svmla_f32_z(pg, scale, p12345, scale);
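The block above is the standard range reduction e^x = 2^n * e^r, with n recovered through the 2^23 + 127 shifting trick and r accumulated against a split ln(2) constant. A scalar sketch of the same flow, assuming plain Taylor coefficients in place of the tuned svexp_f32_coeff values and a single ln(2) constant instead of the hi/lo split:

#include <cmath>
#include <limits>

static float exp_range_reduced(float x)
{
    if (x > 88.37f)  return std::numeric_limits<float>::infinity(); // ~ln(2^127.5)
    if (x < -86.64f) return 0.0f;                                   // ~ln(2^-125)

    const float inv_ln2 = 1.4426950408889634f;
    const float ln2     = 0.6931471805599453f;

    const int   n = static_cast<int>(std::lrint(x * inv_ln2)); // e^x = 2^n * e^r
    const float r = x - static_cast<float>(n) * ln2;           // |r| <= ln(2)/2

    // Degree-5 polynomial for e^r on the reduced range.
    const float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6.0f +
                    r * (1.0f / 24.0f + r * (1.0f / 120.0f)))));
    return std::ldexp(p, n); // scale by 2^n
}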
@@ -213,7 +231,8 @@ inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x)
auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23)));
// Polynomial Approximation
- auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8);
+ auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6,
+ log_tab_7, log_tab_8);
// Reconstruct
poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2);
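The reconstruction line above completes the usual decomposition log(x) = log(v) + m*ln(2), where m is pulled out of the IEEE-754 exponent field and v = x * 2^(-m) lands in the range the polynomial handles. A scalar sketch for positive, normal inputs, with std::log standing in for the polynomial:

#include <cmath>
#include <cstdint>
#include <cstring>

static float log_by_exponent_split(float x) // assumes x > 0 and normal
{
    int32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));

    const int32_t m = (bits >> 23) - 127; // unbiased exponent
    bits -= m * (1 << 23);                // clear it: v = x * 2^(-m)

    float v;
    std::memcpy(&v, &bits, sizeof(v));

    const float ln2 = 0.6931471805599453f;
    return std::log(v) + static_cast<float>(m) * ln2;
}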
@@ -259,7 +278,8 @@ inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val)
//Find positive or negative
const auto c_v = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v)));
const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0)));
- const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0)));
+ const auto odd_v = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))),
+ wrapper::svdup_n(IntType(0)));
auto neg_v = sveor_z(pg, odd_v, sign_v);
@@ -347,7 +367,10 @@ inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b)
#if defined(ARM_COMPUTE_ENABLE_SVE2)
template <>
-inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3)
{
svuint8_t out;
const auto all_true_pg = svptrue_b32();
@@ -381,7 +404,10 @@ inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const
}
template <>
-inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0,
+ const svfloat32_t &in_1,
+ const svfloat32_t &in_2,
+ const svfloat32_t &in_3)
{
svint8_t out;
const auto all_true_pg = svptrue_b32();
diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h
index 6808577681..288d45d979 100644
--- a/src/core/NEON/SVESymm.h
+++ b/src/core/NEON/SVESymm.h
@@ -28,6 +28,7 @@
#if defined(ARM_COMPUTE_ENABLE_SVE2)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -42,8 +43,10 @@ namespace arm_compute
*/
inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale)
{
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x2_t vdequantized_input = svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x2_t vdequantized_input =
+ svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
return vdequantized_input;
}
@@ -76,13 +79,13 @@ inline svint16_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x2_t qv, float
*/
inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const auto vscale = svdup_n_f32(scale);
- const svfloat32x4_t vdequantized_input = svcreate4_f32(
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
- svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
+ const float scale = qi.scale;
+ const auto vscale = svdup_n_f32(scale);
+ const svfloat32x4_t vdequantized_input =
+ svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
+ svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
return vdequantized_input;
}
@@ -112,4 +115,4 @@ inline svint16x2_t svquantize_qsymm16_z(svbool_t pg, const svfloat32x4_t qv, con
} // namespace arm_compute
#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
-#endif // ARM_COMPUTE_NESYMM_H
\ No newline at end of file
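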
+#endif // ARM_COMPUTE_NESYMM_H
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 108b199df7..deb89996a9 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -28,18 +28,17 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
-#include "src/core/common/Registrars.h"
-
#include <map>
namespace arm_compute
@@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData
const CPUInfo &ci;
};
using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type;
-using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
- float, ActivationLayerInfo &, const Window &)>::type;
+using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ ActivationLayerInfo &,
+ const Window &)>::type;
struct BatchNormalizationKernel
{
@@ -62,41 +68,32 @@ struct BatchNormalizationKernel
BatchNormalizationKernelPtr ukernel;
};
-static const BatchNormalizationKernel available_kernels[] =
-{
+static const BatchNormalizationKernel available_kernels[] = {
#if defined(ARM_COMPUTE_ENABLE_SVE)
- {
- "sve_fp16_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)
- },
- {
- "sve_fp32_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
- },
+ {"sve_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)},
+ {"sve_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
#endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)
- },
+ {"neon_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- {
- "neon_fp32_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
- },
+ {"neon_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
#endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */
};
const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec
return nullptr;
}
-Status
-validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() });
+ const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(nullptr != output)
+ if (nullptr != output)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- if(beta != nullptr)
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
return Status{};
}
@@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
// Only compute denominator and constants once per feature map.
int slice = -1;
- const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T mean = static_cast<T>(0);
T var = static_cast<T>(0);
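With the per-channel statistics loaded, the loop body that follows computes, for every element, out = beta + gamma * (x - mean) / sqrt(var + epsilon). The scalar form, matching the leftover-element path further down:

#include <cmath>

static float batch_normalize(float x, float mean, float var, float gamma, float beta, float epsilon)
{
    const float denominator = 1.0f / std::sqrt(var + epsilon); // computed once per channel
    const float x_bar       = (x - mean) * denominator;
    return beta + x_bar * gamma;
}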
@@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
- execute_window_loop(win_to_use, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- if(slice != id.z())
+ execute_window_loop(
+ win_to_use,
+ [&](const Coordinates &id)
{
- mean = input_mean[id.z()];
- var = input_var[id.z()];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[id.z()];
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- }
- if(input_beta != nullptr)
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ if (slice != id.z())
{
- beta = input_beta[id.z()];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ mean = input_mean[id.z()];
+ var = input_var[id.z()];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id.z()];
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ }
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id.z()];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Calculate denominator
+ denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ denominator = wrapper::vgetlane(denominator_vec, 0);
+ slice = id.z();
}
- // Calculate denominator
- denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- denominator = wrapper::vgetlane(denominator_vec, 0);
- slice = id.z();
- }
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator_vec);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(fused_activation)
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator_vec);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T numerator = input_ptr[x] - mean;
- const T x_bar = numerator * denominator;
- T res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(fused_activation)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ const T numerator = input_ptr[x] - mean;
+ const T x_bar = numerator * denominator;
+ T res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *(output_ptr + x) = res;
}
-
- // Store results
- *(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
+ _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false,
+ detail::dummy<float16_t, 8>>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
@@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
void NEBatchNormalizationLayerKernel::configure_fused()
{
// NCHW Fused Batched Normalization with activation functions : FP32
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
- };
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>}};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// NCHW Fused Batched Normalization with activation functions : FP16
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
- };
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>}};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused()
}
NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
+ : _func(nullptr),
+ _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _gamma(nullptr),
+ _beta(nullptr),
+ _epsilon(),
+ _act_info()
{
}
-void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
- const ITensor *mean, const ITensor *var,
- const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void NEBatchNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
- mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr,
- (gamma != nullptr) ? gamma->info() : nullptr,
- epsilon, act_info));
+ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
_input = input;
_output = input;
@@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
_act_info = act_info;
const bool run_in_place = (output == nullptr) || (output == input);
- if(!run_in_place)
+ if (!run_in_place)
{
_output = output;
}
// Configure activation function to run
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
- if(_act_info.enabled())
+ if (_act_info.enabled())
{
configure_fused();
}
@@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
Window win = calculate_max_window(*input->info(), Steps());
INEKernel::configure(win);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
}
-Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
@@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW);
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
(this->*_func)(window);
}
else
{
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() });
+ const auto *uk =
+ get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()});
uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window);
}
}
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 0551ace30c..2e8ff0dc9a 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -68,7 +69,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
*
@@ -85,10 +92,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index 83fb5f6f51..f299bb94a4 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+ int block_shape_x,
+ int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
- const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
}
@@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in
} // namespace
NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _block_shape_x(),
+ _block_shape_y(),
+ _crop_info()
{
}
@@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
+void NEBatchToSpaceLayerKernel::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
@@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh
ICPPKernel::configure(win);
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
@@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
-
- const int x = id.x();
- const int y = id.y();
- const int z = id.z();
- // Translate x, y to uncropped version
- const int x_c = x + _crop_info.left;
- const int y_c = y + _crop_info.top;
-
- const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
- const int in_x = x_c / _block_shape_x;
- const int in_y = y_c / _block_shape_y;
- Coordinates input_coords{ in_x, in_y, z, in_batch };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+ const int z = id.z();
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{in_x, in_y, z, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
@@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
-
- const int x = id.y();
- const int y = id.z();
-
- // Translate x, y to uncropped version
- const int x_c = x + _crop_info.left;
- const int y_c = y + _crop_info.top;
-
- const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
- const int in_x = x_c / _block_shape_x;
- const int in_y = y_c / _block_shape_y;
- Coordinates input_coords{ 0, in_x, in_y, in_batch };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0));
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{0, in_x, in_y, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords),
+ element_size * _input->info()->dimension(0));
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
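For reference, the index arithmetic preserved by the reindented NCHW loop above maps each output element back to exactly one input element. The following standalone sketch (illustrative names only, not part of this patch or of the library API) reproduces that mapping and can be compiled on its own.

// Standalone sketch of the batch-to-space coordinate mapping used in the NCHW loop above.
#include <cstdio>

struct Coord4 { int x, y, z, n; };

// Map an output element (x, y, z, batch_id) back to its input element,
// mirroring the in_x / in_y / in_batch arithmetic of NEBatchToSpaceLayerKernel::run().
Coord4 batch_to_space_src(int x, int y, int z, int batch_id,
                          int block_x, int block_y, int batch_size,
                          int crop_left, int crop_top)
{
    const int x_c      = x + crop_left; // translate to the uncropped output
    const int y_c      = y + crop_top;
    const int in_batch = batch_id + ((x_c % block_x) + (y_c % block_y) * block_x) * batch_size;
    return {x_c / block_x, y_c / block_y, z, in_batch};
}

int main()
{
    // 2x2 block, one output batch, no cropping: the four spatial phases of each
    // output tile come from four consecutive input batches.
    for (int y = 0; y < 2; ++y)
        for (int x = 0; x < 2; ++x)
        {
            const Coord4 src = batch_to_space_src(x, y, 0, 0, 2, 2, 1, 0, 0);
            std::printf("out(%d,%d) <- in(x=%d, y=%d, batch=%d)\n", x, y, src.x, src.y, src.n);
        }
    return 0;
}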
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
index 5eceee0904..d98ac621b0 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -68,7 +69,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{});
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -90,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{});
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 677c5cddcc..a59bbd233b 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2,
}
} // namespace
-NEBitwiseAndKernel::NEBitwiseAndKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
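The body of bitwise_and itself is outside this hunk; by analogy with the OR and XOR kernels further down (which load 16 bytes per input with vld1q_u8), a minimal NEON sketch of the per-iteration work dispatched by the single-line lambda above could look like the following. It assumes an AArch64/NEON target and is illustrative only, not the library's implementation.

// Minimal sketch of a 16-byte bitwise-AND micro-kernel (assumed shape, AArch64/NEON only).
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

static inline void bitwise_and_u8x16(const uint8_t *in1, const uint8_t *in2, uint8_t *out)
{
    const uint8x16_t a = vld1q_u8(in1); // load 16 bytes from each input
    const uint8x16_t b = vld1q_u8(in2);
    vst1q_u8(out, vandq_u8(a, b));      // store the element-wise AND
}

int main()
{
    uint8_t a[16], b[16], c[16];
    for (int i = 0; i < 16; ++i)
    {
        a[i] = uint8_t(i);
        b[i] = 0x0F;
    }
    bitwise_and_u8x16(a, b, c);
    std::printf("c[10] = %u\n", unsigned(c[10])); // 10 & 0x0F = 10
    return 0;
}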
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 19b1af690a..ecd181a7af 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri
}
} // namespace
-NEBitwiseNotKernel::NEBitwiseNotKernel()
- : _input(nullptr), _output(nullptr)
+NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info)
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_not_U8_U8(input.ptr(), output.ptr());
- },
- input, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 08094fbfcf..4c906134aa 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseOrKernel::NEBitwiseOrKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index fc5b38b64f..dbbed2483c 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseXorKernel::NEBitwiseXorKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2,
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 69bfd56ce0..cb869838e2 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/boundingboxtransform/list.h"
@@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData
};
using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
-using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)>::type;
struct BoundingBoxTransformKernel
{
@@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel
BoundingBoxTransformUKernelPtr ukernel;
};
-static const BoundingBoxTransformKernel available_kernels[] =
-{
- {
- "fp32_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
- },
+static const BoundingBoxTransformKernel available_kernels[] = {
+ {"fp32_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
- },
+ {"fp16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "qu16_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
- },
+ {"qu16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
};
@@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] =
*/
const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS
return nullptr;
}
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes);
@@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
- if(boxes->data_type() == DataType::QASYMM16)
+ if (boxes->data_type() == DataType::QASYMM16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8);
const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform();
@@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(pred_boxes->data_type() == DataType::QASYMM16)
+ if (pred_boxes->data_type() == DataType::QASYMM16)
{
const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f);
@@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel()
{
}
-void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransformKernel::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
// Configure kernel window
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
// Set instance variables
_boxes = boxes;
@@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred
INEKernel::configure(win);
}
-Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
@@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+ const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
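The selector table and get_implementation() reshaped above follow a simple first-match dispatch pattern: a static array of {name, predicate, function pointer} entries scanned in order, with the caller asserting that a match was found. A self-contained mock of that pattern, using stand-in types rather than the Compute Library ones, is sketched below.

// Simplified illustration of the micro-kernel selection pattern; types are stand-ins.
#include <cstdio>

enum class DataType { F16, F32, QASYMM16 };

struct SelectorData { DataType dt; };

using SelectorPtr = bool (*)(const SelectorData &);
using UKernelPtr  = void (*)();

struct KernelEntry
{
    const char *name;
    SelectorPtr is_selected;
    UKernelPtr  ukernel;
};

static void run_fp32() { std::printf("running fp32 kernel\n"); }
static void run_qu16() { std::printf("running qasymm16 kernel\n"); }

static const KernelEntry available_kernels[] = {
    {"fp32_neon", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_fp32},
    {"qu16_neon", [](const SelectorData &d) { return d.dt == DataType::QASYMM16; }, run_qu16},
};

const KernelEntry *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr; // callers assert on this, as the kernel above does
}

int main()
{
    const KernelEntry *uk = get_implementation(SelectorData{DataType::F32});
    if (uk != nullptr)
    {
        uk->ukernel();
    }
    return 0;
}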
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index def827836c..3915994feb 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -63,7 +63,8 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -77,7 +78,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 64da1f2262..3b53b7055f 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu
Iterator in(input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.x();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimX, channel_id * num_groups + group_id);
- std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
- },
- in);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ // Shuffle channel
+ const unsigned int curr_channel = id.x();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+ std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+ },
+ in);
}
void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
{
@@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu
Iterator in(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.z();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
- const uint8_t *input_ptr = in.ptr();
- uint8_t *output_ptr = output->ptr_to_element(out_coords);
-
- // Copy plane
- for(unsigned int y = 0; y < height; ++y)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- std::copy_n(input_ptr, row_size, output_ptr);
- input_ptr += input_stride_y;
- output_ptr += output_stride_y;
- }
- },
- in);
+ // Shuffle channel
+ const unsigned int curr_channel = id.z();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *output_ptr = output->ptr_to_element(out_coords);
+
+ // Copy plane
+ for (unsigned int y = 0; y < height; ++y)
+ {
+ std::copy_n(input_ptr, row_size, output_ptr);
+ input_ptr += input_stride_y;
+ output_ptr += output_stride_y;
+ }
+ },
+ in);
}
} // namespace
-NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr), _num_groups()
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups()
{
}
@@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu
INEKernel::configure(win);
}
-Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
return Status{};
@@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_input->info()->data_layout())
+ switch (_input->info()->data_layout())
{
case DataLayout::NHWC:
channel_shuffle_nhwc(_input, _output, _num_groups, window);
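The shuffle arithmetic kept intact by the reindented NHWC and NCHW loops above reduces to a single channel permutation: with K = channels / num_groups, input channel c lands on output channel (c % K) * num_groups + (c / K). The kernel derives c / K through a precomputed reciprocal (rK); the sketch below uses plain integer division for clarity and is illustrative only.

// Standalone sketch of the channel-shuffle index mapping used by the loops above.
// 'channels' must be a multiple of 'num_groups', as the validation above enforces.
#include <cstdio>

unsigned int shuffled_channel(unsigned int c, unsigned int channels, unsigned int num_groups)
{
    const unsigned int K          = channels / num_groups; // channels per group
    const unsigned int group_id   = c / K;
    const unsigned int channel_id = c - group_id * K;      // c % K
    return channel_id * num_groups + group_id;
}

int main()
{
    // 6 channels, 2 groups: the output channel order becomes 0, 3, 1, 4, 2, 5.
    for (unsigned int c = 0; c < 6; ++c)
    {
        std::printf("%u -> %u\n", c, shuffled_channel(c, 6, 2));
    }
    return 0;
}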
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 1976302036..bc6652fd30 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
#define ARM_COMPUTE_NECOL2IMKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
class ITensor;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 94c455305c..60271fbc74 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -26,14 +26,15 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/helpers/bit_ops.h"
#include "src/cpu/kernels/crop/list.h"
@@ -47,7 +48,8 @@ struct CropSelectorData
};
using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
-using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
+using CropUKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
struct CropUKernel
{
@@ -56,48 +58,23 @@ struct CropUKernel
CropUKernelPtr ukernel;
};
-static const CropUKernel available_kernels[] =
-{
- {
- "fp16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
- },
- {
- "f32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
- },
- {
- "u8_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
- },
- {
- "u16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
- },
- {
- "u32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
- },
- {
- "s8_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
- },
- {
- "s16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
- },
- {
- "s32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
- },
+static const CropUKernel available_kernels[] = {
+ {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)},
+ {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)},
+ {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)},
+ {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)},
+ {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)},
+ {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)},
+ {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)},
+ {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)},
};
/** Micro-kernel selector
@@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] =
*/
const CropUKernel *get_implementation(const CropSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data)
return nullptr;
}
-inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+inline void out_of_bounds_crop_window(const ITensor *output,
+ float *output_ptr,
+ float extrapolation_value,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit)
{
- auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x)
+ auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for (; x <= limit - window_step_x; x += window_step_x)
{
wrapper::vstore(output_start_ptr + x, in);
}
- for(; x < limit; ++x)
+ for (; x < limit; ++x)
{
*(output_start_ptr + x) = extrapolation_value;
}
}
-inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
- const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
- bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped)
+inline void execute_window(const ITensor *input,
+ const ITensor *output,
+ Coordinates input_offset,
+ float extrapolation_value,
+ const std::array<uint32_t, 2> &rows_out_of_bounds,
+ const std::array<uint32_t, 2> &cols_out_of_bounds,
+ NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
+ bool is_height_flipped,
+ bool has_cols_in_bounds,
+ bool has_cols_out_of_bounds_before,
+ bool has_cols_out_of_bounds_after,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
// Output is always float.
const int window_step_x = 16 / sizeof(float);
@@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
// |------------------------------|
// Fill all output rows that have no elements that are within the input bounds with the extrapolation value.
// First for the rows before the in bounds rows.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[0] * output->info()->dimension(1));
output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
// Iterate through each row that has any elements within the input bounds.
- for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
- ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+ for (uint32_t row = rows_out_of_bounds[0];
+ static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+ ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
{
// Fill all elements in the row that are out of bounds with the extrapolation value.
// First for the elements before the in bounds elements.
- if(has_cols_out_of_bounds_before)
+ if (has_cols_out_of_bounds_before)
{
out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
}
// Copy all elements within the input bounds from the input tensor.
- if(has_cols_in_bounds)
+ if (has_cols_in_bounds)
{
(*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
- output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
+ output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel,
+ is_width_flipped);
}
// Fill all elements after the in bounds elements with the extrapolation value.
- if(has_cols_out_of_bounds_after)
+ if (has_cols_out_of_bounds_after)
{
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x,
+ output->info()->dimension(1) - cols_out_of_bounds[1],
+ output->info()->dimension(1));
}
output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
}
// Fill all rows after the in bounds elements with the extrapolation value.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[1] * output->info()->dimension(1));
}
} // namespace
NECropKernel::NECropKernel()
- : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
+ : _input(nullptr),
+ _crop_boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _start(),
+ _end(),
+ _crop_box_ind(0),
+ _extrapolation_value(0),
+ _rows_out_of_bounds(),
+ _cols_out_of_bounds()
{
}
-void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+void NECropKernel::configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(),
+ crop_box_ind, extrapolation_value));
_input = input;
_crop_boxes = crop_boxes;
@@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
_extrapolation_value = extrapolation_value;
}
-Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+Status NECropKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_UNUSED(extrapolation_value);
- const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+ const auto *uk = get_implementation(CropSelectorData{input->data_type()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16,
+ DataType::F16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape()
// The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+ _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1,
+ abs(_end[1] - _start[1]) + 1);
_output->info()->set_tensor_shape(out_shape);
bool is_width_flipped = _end[0] < _start[0];
bool is_height_flipped = _end[1] < _start[1];
- if(is_height_flipped)
+ if (is_height_flipped)
{
- _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
_rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
else
{
_rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
- _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
+ _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
_cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
else
{
_cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
- _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
+ _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
INEKernel::configure(calculate_max_window(*_output->info()));
@@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
- const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()});
uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
- Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
- _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
- execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
- _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
+ Coordinates input_offset(
+ 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+ _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+ execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds,
+ uk->ukernel, _end[1] < _start[1],
+ _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+ _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
_start[0] <= _end[0], _end[0] < _start[0]);
}
} // namespace arm_compute
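The configure_output_shape() logic reformatted above boils down to counting, per axis, how many leading and trailing output elements fall outside the input; run() later fills those regions with the extrapolation value via out_of_bounds_crop_window(). A minimal sketch of that bookkeeping for the non-flipped case, with illustrative names, is shown below.

// Per-axis out-of-bounds bookkeeping for a non-flipped crop (illustrative sketch).
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct OutOfBounds { uint32_t before, after; };

OutOfBounds out_of_bounds_1d(int32_t start, int32_t end, int32_t input_size, uint32_t output_size)
{
    OutOfBounds oob{0, 0};
    if (start < 0)
    {
        oob.before = std::min<uint32_t>(static_cast<uint32_t>(-start), output_size);
    }
    if (end >= input_size)
    {
        oob.after = std::min<uint32_t>(static_cast<uint32_t>(end - input_size + 1), output_size);
    }
    return oob;
}

int main()
{
    // Crop columns [-2, 5] from a 4-wide input: 2 columns before and 2 after are padding.
    const OutOfBounds oob = out_of_bounds_1d(-2, 5, 4, 8);
    std::printf("before=%u after=%u\n", oob.before, oob.after);
    return 0;
}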
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 6c989c1d2c..da4a1b26e5 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEON_CROP_KERNEL_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -67,7 +67,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
*
@@ -82,7 +87,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Configure output tensor's shape as this can only be determined at runtime. */
void configure_output_shape();
@@ -91,7 +101,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
/** Function to use for in bounds crop for the particular tensor types passed to configure() */
- using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
+ using InBoundsCropFunction =
+ void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 6dcc85ec2e..de0079ee60 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
Window slice_in = window.first_slice_window_2D();
do
{
Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{ out_x, out_y, z, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_2D(slice_in));
+ execute_window_loop(
+ slice_in,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+
+ const int z = id.z() % r;
+ const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
+ Coordinates output_coords{out_x, out_y, z, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_2D(slice_in));
}
else
{
@@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
do
{
Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{ z, out_x, out_y, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_3D(slice_in));
+ execute_window_loop(
+ slice_in,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ const int z = id.x() % r;
+ const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
+ Coordinates output_coords{z, out_x, out_y, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_3D(slice_in));
}
}
} // namespace arm_compute
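The NCHW depth-to-space loop above scatters each input channel into a block_shape x block_shape spatial cell of the output, with r = channels / (block * block) output channels. The sketch below (illustrative names, not library code) reproduces that coordinate mapping.

// Standalone sketch of the NCHW depth-to-space coordinate mapping used above.
// 'channels' must be divisible by block * block.
#include <cstdio>

struct Coord3 { int x, y, c; };

Coord3 depth_to_space_dst(int x, int y, int c, int channels, int block)
{
    const int r     = channels / (block * block); // channels in the output
    const int phase = c / r;                      // which cell of the block this channel feeds
    return {x * block + phase % block,            // out_x
            y * block + phase / block,            // out_y
            c % r};                               // out_c
}

int main()
{
    // 4 input channels, block 2, 1 output channel: the channel index selects the 2x2 cell.
    for (int c = 0; c < 4; ++c)
    {
        const Coord3 dst = depth_to_space_dst(0, 0, c, 4, 2);
        std::printf("in c=%d -> out(x=%d, y=%d, c=%d)\n", c, dst.x, dst.y, dst.c);
    }
    return 0;
}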
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index 261437f07d..a5969cd497 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,16 +38,19 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -56,7 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -68,12 +75,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
{
}
-void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+void NEFFTDigitReverseKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
@@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- if(axis == 0)
+ if (axis == 0)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
}
@@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
}
}
- else if(axis == 1)
+ else if (axis == 1)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
}
@@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
}
}
-Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window)
std::vector<float> buffer_row_out(2 * N);
std::vector<float> buffer_row_in(2 * N);
- execute_window_loop(slice, [&](const Coordinates &)
- {
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &)
{
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
-
- // Shuffle
- for(size_t x = 0; x < 2 * N; x += 2)
+ if (is_input_complex)
{
- size_t idx = buffer_idx[x / 2];
- buffer_row_out[x] = buffer_row_in[2 * idx];
- buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
- }
- }
- else
- {
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
- // Shuffle
- for(size_t x = 0; x < N; ++x)
+ // Shuffle
+ for (size_t x = 0; x < 2 * N; x += 2)
+ {
+ size_t idx = buffer_idx[x / 2];
+ buffer_row_out[x] = buffer_row_in[2 * idx];
+ buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+ }
+ }
+ else
{
- size_t idx = buffer_idx[x];
- buffer_row_out[2 * x] = buffer_row_in[idx];
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+
+ // Shuffle
+ for (size_t x = 0; x < N; ++x)
+ {
+ size_t idx = buffer_idx[x];
+ buffer_row_out[2 * x] = buffer_row_in[idx];
+ }
}
- }
- // Copy back
- memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
- },
- in, out);
+ // Copy back
+ memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+ },
+ in, out);
}
template <bool is_input_complex, bool is_conj>
@@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window)
const size_t stride_z = _input->info()->strides_in_bytes()[2];
const size_t stride_w = _input->info()->strides_in_bytes()[3];
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- auto *out_ptr = reinterpret_cast<float *>(out.ptr());
- auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
- const size_t y_shuffled = buffer_idx[id.y()];
-
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- // Shuffle the entire row into the output
- memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+ auto *out_ptr = reinterpret_cast<float *>(out.ptr());
+ auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+ const size_t y_shuffled = buffer_idx[id.y()];
- // Conjugate if necessary
- if(is_conj)
+ if (is_input_complex)
{
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ // Shuffle the entire row into the output
+ memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+ // Conjugate if necessary
+ if (is_conj)
{
- out_ptr[x + 1] = -out_ptr[x + 1];
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x + 1] = -out_ptr[x + 1];
+ }
}
}
- }
- else
- {
- // Shuffle the entire row into the buffer
- memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
-
- // Copy the buffer to the output, with a zero imaginary part
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ else
{
- out_ptr[x] = buffer_row[x / 2];
+ // Shuffle the entire row into the buffer
+ memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+ // Copy the buffer to the output, with a zero imaginary part
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x] = buffer_row[x / 2];
+ }
}
- }
- },
- out);
+ },
+ out);
}
void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
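digit_reverse_kernel_axis_0 above shuffles every row of the input through the precomputed U32 index tensor and, for real input, widens the result to interleaved complex. A standalone sketch of that shuffle for one already-complex row (written for this note; the helper name and the flat re/im layout are assumptions, not code from the patch):

#include <cstddef>
#include <cstdint>
#include <vector>

// Reorder one row of N interleaved complex floats {re, im, re, im, ...}
// according to a precomputed index table, optionally conjugating on the way.
static void digit_reverse_row(const std::vector<float>    &row_in,  // 2 * N floats
                              std::vector<float>          &row_out, // 2 * N floats
                              const std::vector<uint32_t> &idx,     // N source indices
                              bool                         conjugate)
{
    const std::size_t N = idx.size();
    for (std::size_t x = 0; x < N; ++x)
    {
        const std::size_t src = idx[x];
        row_out[2 * x]     = row_in[2 * src];
        row_out[2 * x + 1] = conjugate ? -row_in[2 * src + 1] : row_in[2 * src + 1];
    }
}

The real-input branch in the kernel does the same walk but reads one float per index into the even slots and leaves the zero-initialised odd (imaginary) slots untouched.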
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
index f436c364b2..ecf85ebc98 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -70,7 +71,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 44c841f626..4b58a7b9ac 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -28,10 +28,11 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/ToolchainSupport.h"
#include <arm_neon.h>
@@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
{
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
- const float32x2_t mask = { -1.0, 1.0 };
+ const float32x2_t mask = {-1.0, 1.0};
const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
@@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
const float a_r = wrapper::vgetlane(a, 0);
const float a_i = wrapper::vgetlane(a, 1);
- const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+ const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
return out;
}
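The two hunks above only re-brace c_mul_neon and c_mul_neon_img, but it may help to restate what they compute: the standard complex product (a_r + i*a_i)(b_r + i*b_i) = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r), with the {-1, 1} mask supplying the sign flip on the a_i*b_i term. An equivalent minimal sketch using raw NEON intrinsics instead of the library's wrapper layer (illustrative only; the helper name is not from this patch):

#include <arm_neon.h>

// (a_r + i*a_i) * (b_r + i*b_i), both packed as {re, im} in a float32x2_t.
static inline float32x2_t c_mul_sketch(float32x2_t a, float32x2_t b)
{
    const float32x2_t a_r  = vdup_lane_f32(a, 0); // {a_r, a_r}
    const float32x2_t a_i  = vdup_lane_f32(a, 1); // {a_i, a_i}
    const float32x2_t b_sw = vrev64_f32(b);       // {b_i, b_r}
    const float32x2_t mask = {-1.0f, 1.0f};       // negate only the a_i*b_i term
    // {a_r*b_r, a_r*b_i} + {-a_i*b_i, a_i*b_r}
    return vmla_f32(vmul_f32(a_r, b), vmul_f32(mask, a_i), b_sw);
}

c_mul_neon_img is the special case where b is purely imaginary (b = i*k), which reduces the product to {-a_i*k, a_r*k}, exactly the {-a_i, a_r} swap visible in the hunk above.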
@@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_
return wrapper::vadd(t2, e);
}
-float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+float32x2_t reduce_sum_7(
+ float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32
return wrapper::vadd(t00, t01);
}
-float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+float32x2_t reduce_sum_8(float32x2_t x1,
+ float32x2_t x2,
+ float32x2_t x3,
+ float32x2_t x4,
+ float32x2_t x5,
+ float32x2_t x6,
+ float32x2_t x7,
+ float32x2_t x8)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w,
x = wrapper::vadd(a, b);
x = wrapper::vadd(x, c);
- const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
- const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+ const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c));
+ const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c));
y = z = wrapper::vsub(a, v1);
y = wrapper::vadd(y, v2);
z = wrapper::vsub(z, v2);
}
-void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+void fft_4(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3)
{
float32x2_t a = x1;
float32x2_t b = c_mul_neon(w, x2);
@@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c
x4 = wrapper::vadd(x41, x42);
}
-void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+void fft_5(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
+ const float32x2_t &w4)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto d = c_mul_neon(w3, x4);
const auto e = c_mul_neon(w4, x5);
- const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+ const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b);
+ const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b);
- const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+ const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
- const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+ const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
- const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+ const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
+ const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
x1 = reduce_sum_5(a, b, c, d, e);
x2 = reduce_sum_5(a, b0, c0, d0, e0);
@@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x5 = reduce_sum_5(a, b3, c3, d3, e3);
}
-void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+void fft_7(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
const float32x2_t &w4,
- const float32x2_t &w5, const float32x2_t &w6)
+ const float32x2_t &w5,
+ const float32x2_t &w6)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto f = c_mul_neon(w5, x6);
const auto g = c_mul_neon(w6, x7);
- const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+ const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b);
+ const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
+ const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
+ const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
+ const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
+ const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
+ const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
+ const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
+ const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
+ const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
+ const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
+ const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
+ const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
+ const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
+ const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
x1 = reduce_sum_7(a, b, c, d, e, f, g);
x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
@@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
}
-void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+void fft_8(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ float32x2_t &x8,
+ const float32x2_t &w,
+ const float32x2_t &w2,
const float32x2_t &w3,
- const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+ const float32x2_t &w4,
+ const float32x2_t &w5,
+ const float32x2_t &w6,
const float32x2_t &w7)
{
const auto a = x1;
@@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto g = c_mul_neon(w6, x7);
const auto h = c_mul_neon(w7, x8);
- const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
- const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
- const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
- const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
-
- const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
- const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
- const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
- const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
- const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+ const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b1 = c_mul_neon(float32x2_t{0, -1}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b);
+ const auto b5 = c_mul_neon(float32x2_t{0, 1}, b);
+ const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
+ const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
+ const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
+ const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
+ const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
+ const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
+ const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
+ const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
+ const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
+
+ const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
+ const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
+ const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
+ const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
+ const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
@@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
}
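The constant pairs written out literally in fft_8 above ({kSqrt2Div2, -kSqrt2Div2}, {0, -1}, {-1, 0}, ...) are the eighth roots of unity, which is why they collapse to 0, plus/minus 1 and plus/minus sqrt(2)/2; the kW5_* and kW7_* pairs used by fft_5 and fft_7 presumably play the same role for the fifth and seventh roots. A run-time way to generate such values, shown only to make the hard-coded tables readable (the library deliberately bakes them in instead):

#include <complex>
#include <vector>

// k-th twiddle factor of an N-point DFT: exp(-2*pi*i*k / N).
static std::vector<std::complex<float>> make_twiddles(unsigned int N)
{
    const float kPi = 3.14159265358979323846f;
    std::vector<std::complex<float>> w(N);
    for (unsigned int k = 0; k < N; ++k)
    {
        w[k] = std::polar(1.0f, -2.0f * kPi * static_cast<float>(k) / static_cast<float>(N));
    }
    return w;
}

make_twiddles(8) yields {1, sqrt(2)/2 - i*sqrt(2)/2, -i, -sqrt(2)/2 - i*sqrt(2)/2, -1, ...}, the same pairs fft_8 spells out as float32x2_t literals.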
template <bool first_stage>
-void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_2_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- auto a = float32x2_t{ 0, 0 };
- auto b = float32x2_t{ 0, 0 };
+ auto a = float32x2_t{0, 0};
+ auto b = float32x2_t{0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_2(a, b, w);
// Write outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_2_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_3_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_3(a, b, c, w, w2);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_3_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_4_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_4(a, b, c, d, w, w2, w3);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_4_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_5_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_5(a, b, c, d, e, w, w2, w3, w4);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_5_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_7_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_7_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_8_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
- float32x2_t h = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
+ float32x2_t h = {0, 0};
// Base-case prime transform
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_8_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_UNUSED(config);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_UNUSED(config);
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo
// FFT table axis 0: [radix, first_stage]
static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
- if(fft_table_axis0.empty())
+ if (fft_table_axis0.empty())
{
fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
@@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo
// FFT table axis 1: [radix, first_stage]
static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
- if(fft_table_axis1.empty())
+ if (fft_table_axis1.empty())
{
fft_table_axis1[2] = &fft_radix_2_axes_1;
fft_table_axis1[3] = &fft_radix_3_axes_1;
@@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Output auto initialization if not yet initialized
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info()->clone());
}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
_axis = config.axis;
_radix = config.radix;
- switch(config.axis)
+ switch (config.axis)
{
case 0:
set_radix_stage_axis0(config);
@@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
+ auto win_config =
+ validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
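supported_radix() above advertises the base cases these kernels implement: {2, 3, 4, 5, 7, 8}. An FFT length therefore has to factor completely into that set before a chain of radix stages can be built for it. A naive decomposition sketch (illustrative only; it is not claimed to be the planning logic the library's FFT functions actually use):

#include <set>
#include <vector>

// Greedily split N into factors drawn from the supported radix set, largest first.
// Returns an empty vector if N cannot be expressed with those factors.
static std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &radices)
{
    std::vector<unsigned int> stages;
    while (N > 1)
    {
        unsigned int picked = 0;
        for (auto it = radices.rbegin(); it != radices.rend(); ++it)
        {
            if (N % *it == 0)
            {
                picked = *it;
                break;
            }
        }
        if (picked == 0)
        {
            return {}; // e.g. N with a prime factor such as 11 or 13
        }
        stages.push_back(picked);
        N /= picked;
    }
    return stages;
}

For example decompose(120, {2, 3, 4, 5, 7, 8}) returns {8, 5, 3}, i.e. one radix-8, one radix-5 and one radix-3 stage.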
void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
@@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
// Precompute FFT constants
const unsigned int NxRadix = _radix * _Nx;
const float alpha = 2.0f * kPi / float(NxRadix);
- const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+ const float32x2_t w_m{cosf(alpha), -sinf(alpha)};
- if(_axis == 0)
+ if (_axis == 0)
{
const unsigned int N = _input->info()->dimension(0);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &) {
+ _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m,
+ N);
+ },
+ in, out);
}
else
{
const unsigned int N = _input->info()->dimension(0);
const unsigned int M = _input->info()->dimension(1);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M,
- _input->info()->padding().right + _input->info()->padding().left,
- _output->info()->padding().right + _output->info()->padding().left);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &)
+ {
+ _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N,
+ M, _input->info()->padding().right + _input->info()->padding().left,
+ _output->info()->padding().right + _output->info()->padding().left);
+ },
+ in, out);
}
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
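For context on the run() hunks above: the kernel computes the stage's base twiddle once, w_m = {cosf(alpha), -sinf(alpha)} with alpha = 2*pi / (radix * Nx), i.e. exp(-2*pi*i / (radix * Nx)), and hands it to the selected axis-0 or axis-1 worker together with the combined left/right padding of input and output. A scalar restatement of that one precomputation (illustrative; only kPi, the radix and Nx come from the code above):

#include <cmath>
#include <complex>

// Base twiddle handed to every butterfly of one radix stage:
// w_m = exp(-2*pi*i / (radix * Nx)).
static std::complex<float> stage_base_twiddle(unsigned int radix, unsigned int Nx)
{
    const float kPi   = 3.14159265358979323846f;
    const float alpha = 2.0f * kPi / static_cast<float>(radix * Nx);
    return {std::cos(alpha), -std::sin(alpha)}; // matches float32x2_t w_m{cosf(alpha), -sinf(alpha)}
}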
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
index 2291a1068c..54f32efa23 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
#include <arm_neon.h>
@@ -92,8 +93,17 @@ private:
void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
- using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
- using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int, unsigned int, unsigned int)>;
+ using FFTFunctionPointerAxis0 =
+ std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
+ using FFTFunctionPointerAxis1 = std::function<void(float *,
+ float *,
+ unsigned int,
+ unsigned int,
+ const float32x2_t &,
+ unsigned int,
+ unsigned int,
+ unsigned int,
+ unsigned int)>;
FFTFunctionPointerAxis0 _func_0;
FFTFunctionPointerAxis1 _func_1;
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 5ec330bebc..9fe561fc59 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -28,9 +28,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -41,8 +42,8 @@ namespace
void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
{
const auto a = wrapper::vload(c_in);
- auto b = wrapper::vdiv(a, float32x2_t{ scale, scale });
- if(is_conjugate)
+ auto b = wrapper::vdiv(a, float32x2_t{scale, scale});
+ if (is_conjugate)
{
const float img_part = wrapper::vgetlane(b, 1);
b = wrapper::vsetlane(-img_part, b, 1);
@@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -71,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Configure kernel window
Window win = calculate_max_window(*input, Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, input_window);
Iterator out(_run_in_place ? _input : _output, input_window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ { scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); },
+ in, out);
}
} // namespace arm_compute
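scale_complex at the top of this file divides each complex sample by the scale factor and, when requested, negates the imaginary lane, the normalisation-plus-conjugation step typically needed after an inverse FFT. The same operation in scalar form (a readability sketch only, not part of the patch):

#include <complex>

// Normalise one complex sample and optionally conjugate it.
static std::complex<float> scale_complex_scalar(std::complex<float> c, bool is_conjugate, float scale)
{
    c /= scale;
    return is_conjugate ? std::conj(c) : c;
}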
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h
index 24a19f98ba..608cf5ea34 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.h
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
// Forward declarations
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 1c7c1f9763..00b0c0ae8d 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -30,14 +30,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
namespace
{
-inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor *tensor,
+ const Window &window,
+ unsigned int right,
+ unsigned int bottom,
+ const PixelValue &constant_border_value)
{
float border_value;
constant_border_value.get(border_value);
@@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi
Iterator vertical_it(tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
+ {
+ const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
- // Fill left and right borders
- *(row_start - 1) = border_value;
- std::fill_n(row_start + width, right, border_value);
- },
- vertical_it);
+ // Fill left and right borders
+ *(row_start - 1) = border_value;
+ std::fill_n(row_start + width, right, border_value);
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
- // Fill top rows including left/right borders
- std::fill_n(row_start - 1, 1 + width + right, border_value);
-
- // Bottom border
- const unsigned low_border_size = height + bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
-
- // Fill bottom rows including left/right borders
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+ // Fill top rows including left/right borders
std::fill_n(row_start - 1, 1 + width + right, border_value);
- }
- },
- plane_it);
+
+ // Bottom border
+ const unsigned low_border_size = height + bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
+ {
+ const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+ }
+ },
+ plane_it);
}
} // namespace
@@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel()
{
}
-void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
_tensor = tensor;
configure(tensor->info(), border_size, border_mode, constant_border_value);
}
-void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
// If there is no border: early exit
- if(_border_size.empty())
+ if (_border_size.empty())
{
return;
}
@@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_mode)
+ switch (_mode)
{
case BorderMode::CONSTANT:
{
- if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
+ if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
{
- fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom,
+ _constant_border_value);
}
else
{
@@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(),
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size,
+ element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Copy top rows including left/right borders
- std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size,
- base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
+ {
+ // Copy top rows including left/right borders
+ std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) -
+ _border_size.left * element_size,
+ base_addr - _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
- // Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
- {
- // Copy bottom rows including left/right borders
- std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
- base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
- },
- plane_it);
+ // Bottom border
+ for (unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ // Copy bottom rows including left/right borders
+ std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+ base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] -
+ _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
+ },
+ plane_it);
}
void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
@@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value,
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Fill top rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill top rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- // Bottom border
- const unsigned low_border_size = height + _border_size.bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
- {
- // Fill bottom rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ // Bottom border
+ const unsigned low_border_size = height + _border_size.bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill bottom rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- },
- plane_it);
+ },
+ plane_it);
}
} // namespace arm_compute
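
The dominant change in this file, and throughout the patch, is the call-site layout for execute_window_loop: the window, the lambda, and the trailing iterators each move onto their own continuation lines. A minimal, self-contained sketch of the new layout follows; the kernel body, names, and the byte-sized element are illustrative only and not part of the patch:

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Window.h"

    namespace
    {
    // Zero-fills whatever the window visits, one byte per element, purely to show the
    // wrapped execute_window_loop layout produced by clang-format 14.0.6.
    void zero_fill_sketch(arm_compute::ITensor *tensor, const arm_compute::Window &window)
    {
        arm_compute::Iterator it(tensor, window);
        arm_compute::execute_window_loop(
            window,
            [&](const arm_compute::Coordinates &)
            {
                *it.ptr() = 0;
            },
            it);
    }
    } // namespace
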
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 2c851583ed..aaad108bfa 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the function.
*
* @note This kernel fills the borders within the XY-planes.
@@ -75,7 +79,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
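
The configure() overloads above take the tensor (or its info), a BorderSize, the BorderMode, and an optional constant value. Below is a minimal sketch of driving the ITensor overload; the shape, data type, and choice of REPLICATE are assumptions for illustration, and the kernel header path is the internal one used by this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/NEON/kernels/NEFillBorderKernel.h"

    void fill_border_sketch()
    {
        using namespace arm_compute;
        Tensor src;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        src.allocator()->allocate();

        NEFillBorderKernel border_kernel;
        // One-element border around each XY plane; the constant value keeps its
        // PixelValue() default and is ignored for REPLICATE.
        border_kernel.configure(&src, BorderSize(1), BorderMode::REPLICATE);
    }
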
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 51a69046a9..cbe5136fb1 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -30,12 +29,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include <map>
@@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData
};
using FBNSelectorPtr = std::add_pointer<bool(const FuseBatchNormalizeSelectorData &data)>::type;
-using FBNUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, ITensor *,
- const ITensor *, const ITensor *, const ITensor *, const ITensor *, float, const Window &)>::type;
+using FBNUKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ const Window &)>::type;
struct FBNUKernel
{
@@ -62,73 +71,63 @@ struct FBNUKernel
FBNUKernelPtr ukernel;
};
-static const FBNUKernel available_kernels[] =
-{
- {
- "fused_batch_normalization_conv_NHWC_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
- },
- {
- "fused_batch_normalization_conv_NCHW_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
- },
- {
- "fused_batch_normalization_dwc_NHWC_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)
- },
- {
- "fused_batch_normalization_dwc_NCHW_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)
- },
- {
- "fused_batch_normalization_conv_NHWC_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
- },
- {
- "fused_batch_normalization_conv_NCHW_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
- },
- {
- "fused_batch_normalization_dwc_NHWC_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)
- },
- {
- "fused_batch_normalization_dwc_NCHW_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)
- }
-};
+static const FBNUKernel available_kernels[] = {
+ {"fused_batch_normalization_conv_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_conv_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_dwc_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)},
+ {"fused_batch_normalization_dwc_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)},
+ {"fused_batch_normalization_conv_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_conv_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_dwc_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)},
+ {"fused_batch_normalization_dwc_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}};
/** Micro-kernel selector
*
@@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] =
*/
const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -150,10 +149,16 @@ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false),
+ _func(nullptr)
{
}
-void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
_run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
- const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{
+ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
_func = uk->ukernel;
@@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
INEKernel::configure(win);
}
-Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+ (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon,
+ window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
index ee767b01c8..f23280d55a 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -66,9 +66,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -86,10 +93,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -107,8 +120,16 @@ private:
bool _run_in_place_weights;
bool _run_in_place_bias;
- using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
+ using FuseBatchNormFunction = void(const ITensor *input_weights,
+ const ITensor *input_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window);
FuseBatchNormFunction *_func;
};
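
Because validate() above runs the same checks as the anonymous-namespace validate_arguments() in the .cpp, a fusion setup can be vetted from tensor metadata alone. A sketch under assumed shapes (3x3 F32 convolution weights with 8 output channels); the empty TensorInfo objects stand in for fused outputs that configure() would auto-initialise:

    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"

    bool fuse_bn_is_valid_sketch()
    {
        using namespace arm_compute;
        const TensorInfo weights(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32); // assumed [W, H, IFM, OFM]
        const TensorInfo bn_param(TensorShape(8U), 1, DataType::F32);            // mean/var, one value per OFM
        TensorInfo fused_weights;                                                // empty on purpose
        TensorInfo fused_bias;

        const Status st = NEFuseBatchNormalizationKernel::validate(&weights, &bn_param, &bn_param,
                                                                   &fused_weights, &fused_bias);
        return st.error_code() == ErrorCode::OK;
    }
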
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 11332ffac8..f1d457d399 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(axis < 0)
+ if (axis < 0)
{
axis += input->num_dimensions();
}
ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 >
+ Coordinates::num_max_dimensions);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
const auto idx_info = _indices->info();
const auto dst_info = _output->info();
- const auto num_dims = dst_info->num_dimensions();
+ const auto num_dims = dst_info->num_dimensions();
const auto chunk_stride = src_info->strides_in_bytes()[_axis];
const auto window_start_x = window.x().start();
- const auto window_end_x = window.x().end();
- auto window_size_x = src_info->element_size();
+ const auto window_end_x = window.x().end();
+ auto window_size_x = src_info->element_size();
const auto idx_limit = static_cast<TIndex>(src_info->tensor_shape()[_axis]);
- if(_axis != 0)
+ if (_axis != 0)
{
dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1));
window_size_x *= window_end_x - window_start_x;
}
// Compute source and index tensors window based on the output window.
- auto src_win = dst_win;
+ auto src_win = dst_win;
Window idx_win;
for (size_t i = 0; i < idx_info->num_dimensions(); ++i)
@@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
// Use the custom strides to access all three tensors using the same loop.
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win);
Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win);
- Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win);
-
- execute_window_loop(dst_win, [&](const Coordinates &) {
- const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
-
- if(idx >= 0 && idx < idx_limit)
- {
- const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+ Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(),
+ dst_info->offset_first_element_in_bytes(), dst_win);
- std::copy_n(src_ptr, window_size_x, dst_it.ptr());
- }
- else
+ execute_window_loop(
+ dst_win,
+ [&](const Coordinates &)
{
- std::fill_n(dst_it.ptr(), window_size_x, 0);
- }
- }, src_it, idx_it, dst_it);
+ const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
+
+ if (idx >= 0 && idx < idx_limit)
+ {
+ const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+
+ std::copy_n(src_ptr, window_size_x, dst_it.ptr());
+ }
+ else
+ {
+ std::fill_n(dst_it.ptr(), window_size_x, 0);
+ }
+ },
+ src_it, idx_it, dst_it);
}
void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
@@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
_output = output;
_axis = axis;
- if(_axis < 0)
+ if (_axis < 0)
{
_axis += input->info()->num_dimensions();
}
ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
- switch(_indices->info()->data_type())
+ switch (_indices->info()->data_type())
{
case DataType::U32:
_func = &NEGatherKernel::gather_common<uint32_t>;
@@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
}
// Output auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Create window
@@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
// These will be used to iterate lock-step through all tensors (input, indices and output).
size_t dim_no = 0;
- const auto input_info = input->info();
+ const auto input_info = input->info();
const auto &input_strides = input_info->strides_in_bytes();
- const auto indices_info = indices->info();
- const auto &indices_strides = indices_info->strides_in_bytes();
- const auto indices_num_dims = indices_info->num_dimensions();
+ const auto indices_info = indices->info();
+ const auto &indices_strides = indices_info->strides_in_bytes();
+ const auto indices_num_dims = indices_info->num_dimensions();
- for(; dim_no < static_cast<size_t>(_axis); ++dim_no)
+ for (; dim_no < static_cast<size_t>(_axis); ++dim_no)
{
_src_it_strides[dim_no] = input_strides[dim_no];
}
- for(; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
+ for (; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
{
_idx_it_strides[dim_no] = indices_strides[dim_no - _axis];
}
- for(; dim_no < Coordinates::num_max_dimensions; ++dim_no)
+ for (; dim_no < Coordinates::num_max_dimensions; ++dim_no)
{
_src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1];
}
}
-Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
return Status{};
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
index ce69daeda7..b8c069f99e 100644
--- a/src/core/NEON/kernels/NEGatherKernel.h
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NEGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -92,8 +93,8 @@ private:
ITensor *_output;
kernel_ptr _func;
- Strides _src_it_strides;
- Strides _idx_it_strides;
+ Strides _src_it_strides;
+ Strides _idx_it_strides;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
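
validate() above needs only tensor metadata, and leaving the output TensorInfo empty defers the shape to compute_gather_shape(), mirroring the auto-initialisation done in configure(). A sketch with assumed shapes and U32 indices:

    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/NEON/kernels/NEGatherKernel.h"

    bool gather_is_valid_sketch()
    {
        using namespace arm_compute;
        const TensorInfo src(TensorShape(10U, 4U), 1, DataType::F32); // data to gather from
        const TensorInfo indices(TensorShape(3U), 1, DataType::U32);  // three indices along axis 0
        TensorInfo dst;                                               // empty: shape is inferred
        const Status st = NEGatherKernel::validate(&src, &indices, &dst, 0 /* axis */);
        return st.error_code() == ErrorCode::OK;
    }
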
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 7bba136e84..549319e49f 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -27,11 +27,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/genproposals/list.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -44,7 +46,8 @@ struct ComputeAllAnchorsData
};
using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
-using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(
+ const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
struct ComputeAllAnchorsKernel
{
@@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel
ComputeAllAnchorsUKernelPtr ukernel;
};
-static const ComputeAllAnchorsKernel available_kernels[] =
-{
+static const ComputeAllAnchorsKernel available_kernels[] = {
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "neon_qu16_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
- },
+ {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "neon_fp16_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
- },
+ {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "neon_fp32_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
- },
+ {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
};
/** Micro-kernel selector
@@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] =
*/
const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
const size_t feature_height = info.feat_height();
const size_t feature_width = info.feat_width();
@@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
INEKernel::configure(win);
}
-Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
@@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+ const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index 297d6d4abe..30699eee01 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -78,5 +78,5 @@ private:
ITensor *_all_anchors;
ComputeAnchorsInfo _anchors_info;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
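
A sketch of checking an anchors tensor against NEComputeAllAnchorsKernel::validate(). The ComputeAnchorsInfo argument order used here (feature width, feature height, spatial scale, with values_per_roi defaulting to 4) is an assumption about the Types.h constructor rather than something this patch shows, as are the 38x38 feature map and 1:16 scale:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"

    bool compute_all_anchors_is_valid_sketch()
    {
        using namespace arm_compute;
        const TensorInfo anchors(TensorShape(4U, 9U), 1, DataType::F32); // 9 anchors, 4 values each
        TensorInfo all_anchors;                                          // empty: filled by configure()
        const ComputeAnchorsInfo info(38.f, 38.f, 1.f / 16.f);           // assumed constructor order
        const Status st = NEComputeAllAnchorsKernel::validate(&anchors, &all_anchors, info);
        return st.error_code() == ErrorCode::OK;
    }
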
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 71641404bf..0a1780f6ee 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -31,12 +31,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/instancenorm/list.h"
#include <arm_neon.h>
@@ -51,7 +52,13 @@ struct InstanceNormSelectorData
};
using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
-using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)>::type;
struct InstanceNormKernel
{
@@ -60,19 +67,12 @@ struct InstanceNormKernel
InstanceNormUKernelPtr ukernel;
};
-static const InstanceNormKernel available_kernels[] =
-{
- {
- "fp32_neon_instancenorm",
- [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
- },
+static const InstanceNormKernel available_kernels[] = {
+ {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_instancenorm",
- [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
- },
+ {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
};
@@ -84,9 +84,9 @@ static const InstanceNormKernel available_kernels[] =
*/
const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
@@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
{
}
-void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void NEInstanceNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
INEKernel::configure(std::get<1>(win_config));
}
-Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
@@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index f166ce2058..024ccd9ef2 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,14 +83,15 @@ private:
* @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to 0.0
* @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12
*/
- using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+ using NormalizationFunction =
+ void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
ITensor *_input;
ITensor *_output;
float _gamma;
float _beta;
float _epsilon;
- bool _use_mixed_precision{ true };
+ bool _use_mixed_precision{true};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
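
The validate() entry point above takes the kernel-descriptor struct rather than loose gamma/beta/epsilon values. A sketch assuming a default-constructed InstanceNormalizationLayerKernelInfo (the header comments above document beta defaulting to 0.0 and epsilon to 1e-12) and a nullptr output, which the .cpp's ternary treats as in-place:

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"

    bool instance_norm_is_valid_sketch()
    {
        using namespace arm_compute;
        // Assumed NCHW F32 input: an 8x8 plane with 3 channels.
        const TensorInfo input(TensorShape(8U, 8U, 3U), 1, DataType::F32);
        const InstanceNormalizationLayerKernelInfo info{}; // assumed defaults from KernelDescriptors.h
        const Status st = NEInstanceNormalizationLayerKernel::validate(&input, nullptr, info);
        return st.error_code() == ErrorCode::OK;
    }
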
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 8ab0288ab1..eea57a17d3 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -30,11 +30,12 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
#include "src/cpu/kernels/l2normlayer/list.h"
#include <arm_neon.h>
@@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData
using L2NormalizeLayerKernelSelctorPtr = std::add_pointer<bool(const L2NormalizeLayerSelectorData &data)>::type;
-using L2NormalizeLayerPtr = std::add_pointer<void(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
+using L2NormalizeLayerPtr = std::add_pointer<void(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
struct L2NormalizeLayerKernel
{
@@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel
L2NormalizeLayerPtr ukernel;
};
-static const L2NormalizeLayerKernel available_kernels[] =
-{
- {
- "fp32_neon_l2normalize_x",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)
- },
- {
- "fp32_neon_l2normalize_yz",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)
- },
+static const L2NormalizeLayerKernel available_kernels[] = {
+ {"fp32_neon_l2normalize_x",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)},
+ {"fp32_neon_l2normalize_yz",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)},
{
"fp16_neon_l2normalize_x",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x),
},
{
"fp16_neon_l2normalize_yz",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz),
},
};
@@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] =
*/
const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
{
}
-void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
+void NEL2NormalizeLayerKernel::configure(
+ const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
@@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su
INEKernel::configure(std::get<1>(win_config));
}
-Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status NEL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_actual_axis > 2)
+ if (_actual_axis > 2)
{
ARM_COMPUTE_ERROR("Unsupported normalization axis");
}
- const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(
+ L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index af3ad3403e..3524e66a21 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -74,7 +74,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
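
validate() above expects the already-reduced sum tensor alongside the input and output. A sketch with assumed shapes where axis 0 has been reduced to size 1, leaving the output empty so only the input/sum consistency checks apply:

    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"

    bool l2_normalize_is_valid_sketch()
    {
        using namespace arm_compute;
        const TensorInfo input(TensorShape(16U, 4U), 1, DataType::F32);
        const TensorInfo sum(TensorShape(1U, 4U), 1, DataType::F32); // input reduced along axis 0
        TensorInfo output;                                           // empty: checked only if set
        const Status st = NEL2NormalizeLayerKernel::validate(&input, &sum, &output, 0, 1e-12f);
        return st.error_code() == ErrorCode::OK;
    }
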
diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp
index 6939e08ef0..6be6284528 100644
--- a/src/core/NEON/kernels/NELogicalKernel.cpp
+++ b/src/core/NEON/kernels/NELogicalKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) && (*src1);
++src0;
@@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) && broadcast_val_clamped_s;
++src;
@@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) || (*src1);
++src0;
@@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) || broadcast_val_clamped_s;
++src;
@@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = !(*src);
++src;
@@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
void run_unary(const Window &window, const ITensor *src, ITensor *dst)
{
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const auto len = window.x().end() - window.x().start();
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- neon_logical_not(in.ptr(), out.ptr(), len);
- },
- in, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out);
}
void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
@@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
const auto len = window.x().end() - window.x().start();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
- using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
- LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
+ using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
+ LogicalBroadcastUKernelPtr logical_func =
+ op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
const bool is_broadcast_input_1 = src1_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win;
@@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const uint8_t broadcast_value = *broadcast_in.ptr();
- logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
-
- },
- broadcast_in, non_broadcast_in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const uint8_t broadcast_value = *broadcast_in.ptr();
+ logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
+ },
+ broadcast_in, non_broadcast_in, out);
}
else
{
- using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
+ using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;
src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator in0(src0, src0_win);
Iterator in1(src1, src1_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
- },
- in0, in1, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out);
}
}
} // namespace
@@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const
return "NELogicalKernel";
}
-void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
+void NELogicalKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));
@@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
Window win = calculate_max_window(*input1, Steps());
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
set_data_type_if_unknown(*output, input1->data_type());
}
-Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
+Status NELogicalKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown);
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T
const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_op == LogicalOperation::Not)
+ if (_op == LogicalOperation::Not)
{
run_unary(window, src0, dst);
}
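
The neon_logical_* helpers reformatted above all share a three-stage structure: a full 16-byte vector loop, an 8-byte half-vector loop, then a scalar tail. A self-contained sketch of that structure for the OR case (illustrative only, not part of this patch, but mirroring the intrinsics used above):

    // Three-stage NEON loop: 16-wide, 8-wide, then scalar remainder.
    #include <arm_neon.h>
    #include <cstdint>

    void logical_or_sketch(const uint8_t *a, const uint8_t *b, uint8_t *dst, uint32_t len)
    {
        const uint8x16_t one_x16 = vdupq_n_u8(1);
        const uint8x8_t  one_x8  = vdup_n_u8(1);
        for (; len >= 16; len -= 16, a += 16, b += 16, dst += 16)
        {
            // Clamp each operand to {0,1} with vmin, then OR the lanes.
            vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(a), one_x16), vminq_u8(vld1q_u8(b), one_x16)));
        }
        for (; len >= 8; len -= 8, a += 8, b += 8, dst += 8)
        {
            vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(a), one_x8), vmin_u8(vld1_u8(b), one_x8)));
        }
        for (; len > 0; --len)
        {
            *dst++ = (*a++) || (*b++);
        }
    }
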
diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h
index caf69cf45d..477a59d826 100644
--- a/src/core/NEON/kernels/NELogicalKernel.h
+++ b/src/core/NEON/kernels/NELogicalKernel.h
@@ -58,10 +58,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
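
For context, a minimal validate-then-configure sketch against the signatures wrapped above (the U8 tensor shapes are invented, and the kernel is assumed to be default-constructible like the other NEON kernels; this is not code from the patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/NEON/kernels/NELogicalKernel.h"

    void logical_or_config_sketch()
    {
        using namespace arm_compute;

        // Hypothetical shapes; the kernel only accepts U8 data.
        TensorInfo in0(TensorShape(16U, 4U), 1, DataType::U8);
        TensorInfo in1(TensorShape(16U, 4U), 1, DataType::U8);
        TensorInfo out(TensorShape(16U, 4U), 1, DataType::U8);

        if (bool(NELogicalKernel::validate(&in0, &in1, &out, LogicalOperation::Or)))
        {
            NELogicalKernel kernel;
            kernel.configure(&in0, &in1, &out, LogicalOperation::Or);
        }
    }
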
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 37e88a8565..451031d696 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -28,12 +28,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/meanstddevnorm/list.h"
namespace arm_compute
@@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData
};
using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
-using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+using MeanStdDevNormUKernelPtr =
+ std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
struct MeanStdDevNormKernel
{
@@ -55,25 +57,15 @@ struct MeanStdDevNormKernel
MeanStdDevNormUKernelPtr ukernel;
};
-static const std::vector<MeanStdDevNormKernel> available_kernels =
-{
- {
- "fp32_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
- },
+static const std::vector<MeanStdDevNormKernel> available_kernels = {
+ {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
- },
+ {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "qasymm8_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)
- },
+ {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
};
/** Micro-kernel selector
@@ -84,9 +76,9 @@ static const std::vector<MeanStdDevNormKernel> available_kernels =
*/
const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
- if(output != nullptr)
+ if (output != nullptr)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto inizialitation if not yet initialized
@@ -128,8 +120,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
- : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
+NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
{
}
@@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr)
+ .first);
return Status{};
}
@@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+ const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _epsilon, window);
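
The reflowed available_kernels table above follows the library's micro-kernel selection pattern: a table of {name, selector predicate, function pointer} entries is scanned and the first entry whose predicate matches the tensor's data type wins. A condensed, self-contained sketch of that pattern (types and signatures simplified for illustration, not the real arm_compute ones):

    #include <vector>

    enum class DataType { F16, F32, QASYMM8 };

    struct SelectorData { DataType dt; };

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)(); // placeholder signature for the sketch
    };

    static const std::vector<KernelEntry> table = {
        {"fp32_kernel", [](const SelectorData &d) { return d.dt == DataType::F32; }, nullptr},
        {"qasymm8_kernel", [](const SelectorData &d) { return d.dt == DataType::QASYMM8; }, nullptr},
    };

    const KernelEntry *select(const SelectorData &d)
    {
        for (const auto &e : table)
        {
            if (e.is_selected(d))
            {
                return &e; // first match wins, mirroring get_implementation()
            }
        }
        return nullptr;    // caller asserts on nullptr, as the kernel does
    }
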
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 49a045382d..2c61bda147 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -29,19 +29,23 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/NormalizationHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
@@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel()
{
}
-void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayerKernel::configure(const ITensor *input,
+ const ITensor *input_squared,
+ ITensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
// Output tensor auto initialization if not yet initialized
@@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
_output = output;
_norm_info = norm_info;
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
}
@@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
}
@@ -124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
}
@@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
}
@@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
const auto beta_vec = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
- auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr,
- T * output_ptr)
+ auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row,
+ const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr,
+ T *output_ptr)
{
const int current_slice = dim == 0 ? x : id[dim];
const int first_slice = std::max(current_slice - radius, 0);
@@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
// Accumulate 2D In-Map values
auto accu = static_cast<T>(0.f);
- for(int j = first_row; j <= last_row; ++j)
+ for (int j = first_row; j <= last_row; ++j)
{
// Compute row displacement
const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
+ for (int i = first_slice; i <= last_slice; ++i)
{
- accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
+ accu +=
+ *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
}
}
// Normalize
- const auto normalized = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
+ const auto normalized = std::pow(
+ accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
const auto normalized_pixel = (*(input_ptr + x)) / normalized;
*(output_ptr + x) = normalized_pixel;
};
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- int x = window_start_x;
- // Compute serially starting elements for the case x dimension is width
- for(; x < radius && x < window_end_x && dim == 0; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
+ int x = window_start_x;
+ // Compute serially starting elements for the case x dimension is width
+ for (; x < radius && x < window_end_x && dim == 0; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
- // Compute vectorized
- for(; x <= window_end_x - window_step_x - radius; x += window_step_x)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- for(int j = first_row; j <= last_row; ++j)
+ // Compute vectorized
+ for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
{
- // Compute row displacement
- const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
+ const int current_slice = dim == 0 ? x : id[dim];
+ const int first_slice = std::max(current_slice - radius, 0);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
+ // Accumulate 2D In-Map values
+ auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ for (int j = first_row; j <= last_row; ++j)
{
- accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ // Compute row displacement
+ const uint8_t *const input_squared_ptr =
+ input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = wrapper::vadd(
+ accu, wrapper::vloadq(reinterpret_cast<const T *>(
+ input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ }
}
- }
- // Normalize
- const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
- const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
- wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
- }
+ // Normalize
+ const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+ const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
+ wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
- },
- input, input_squared, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
+ },
+ input, input_squared, output);
}
-Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
+Status NENormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
index 53a06b9ed9..2d8d9f3d60 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -60,7 +60,8 @@ public:
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+ void
+ configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -72,7 +73,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
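
A hedged sketch of a call to the rewrapped four-parameter validate() above; the shapes, the IN_MAP_2D choice and the NormalizationLayerInfo arguments are assumptions made for illustration, not taken from this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"

    arm_compute::Status norm_validate_sketch()
    {
        using namespace arm_compute;

        const TensorInfo src(TensorShape(32U, 32U, 8U), 1, DataType::F32);
        const TensorInfo src_squared(src); // same shape/type: element-wise square of src
        const TensorInfo dst(src);

        const NormalizationLayerInfo norm_info(NormType::IN_MAP_2D, 5U); // norm_size must be odd
        return NENormalizationLayerKernel::validate(&src, &src_squared, &dst, norm_info);
    }
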
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 734510b637..c9bcbc9127 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -28,26 +28,31 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &paddings,
+ const PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
- const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
+ const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
template <typename T>
void NEPadLayerKernel::run_pad_constant(const Window &window)
{
- Window output_window{ window };
+ Window output_window{window};
output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
const size_t element_size = _input->info()->element_size();
Iterator output_it(_output, output_window);
- execute_window_loop(output_window, [&](const Coordinates & id)
- {
- Coordinates idin{ id };
- for(size_t dim = _padding.size() - 1; dim > 0; --dim)
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id)
{
- idin[dim] -= _padding[dim].first;
- if(idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ Coordinates idin{id};
+ for (size_t dim = _padding.size() - 1; dim > 0; --dim)
{
- std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), _constant_value.get<T>());
- return;
+ idin[dim] -= _padding[dim].first;
+ if (idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ {
+ std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0),
+ _constant_value.get<T>());
+ return;
+ }
}
- }
- T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
- T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
- std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
- memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
- std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get<T>());
- },
- output_it);
+ T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
+ T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
+ std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
+ memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
+ std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second,
+ _constant_value.get<T>());
+ },
+ output_it);
}
void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window)
@@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t end_plane = window.z().end();
size_t start_plane_input = start_plane;
- if(_padding.size() > 2)
+ if (_padding.size() > 2)
{
start_plane_input = (start_plane < _padding[2].first) ? 0 : start_plane - _padding[2].first;
}
@@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t jump_to_next_row_input = _input->info()->dimension(0);
const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second;
- uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
- const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
- const auto pad_value = _constant_value.get<uint8_t>();
+ uint8_t *output_row_ptr =
+ _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
+ const uint8_t *input_it_ptr =
+ _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
+ const auto pad_value = _constant_value.get<uint8_t>();
- for(size_t z_i = start_plane; z_i < end_plane; ++z_i)
+ for (size_t z_i = start_plane; z_i < end_plane; ++z_i)
{
- if(_padding.size() > 2 && z_i < _padding[2].first)
+ if (_padding.size() > 2 && z_i < _padding[2].first)
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
}
- else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
+ else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
@@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
output_row_ptr += pad_y_elems_top;
size_t y_i = _input->info()->dimension(1);
// Basic loop unrolling
- for(; y_i > 3; y_i -= 4)
+ for (; y_i > 3; y_i -= 4)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
memset(output_row_ptr, pad_value, _padding[0].second);
output_row_ptr += _padding[0].second;
}
- for(; y_i > 0; --y_i)
+ for (; y_i > 0; --y_i)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel()
{
}
-void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Auto-init
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
- const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
+ const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
auto_init_if_empty(*output->info(), expected_output_info);
// Perform validation step
@@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
_constant_value = constant_value;
_mode = mode;
- if(_mode == PaddingMode::CONSTANT)
+ if (_mode == PaddingMode::CONSTANT)
{
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 1:
- if(_input->info()->num_dimensions() == 3 && // Is 3D
- padding.size() <= 3 && // Has 3D padding
- !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
+ if (_input->info()->num_dimensions() == 3 && // Is 3D
+ padding.size() <= 3 && // Has 3D padding
+ !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
{
_func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad;
}
@@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
ICPPKernel::configure(win);
}
-Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode));
@@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
@@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c
{
ARM_COMPUTE_UNUSED(thread_count);
ARM_COMPUTE_UNUSED(platform);
-
+
return ICPPKernel::default_mws;
}
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
index f82af1558a..d432887d2c 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.h
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEPADLAYERKERNEL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -62,7 +63,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
* Only CONSTANT padding mode is currently supported
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -75,7 +80,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
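
A hedged usage sketch for the padding kernel's validate() shown above (shapes and pads are invented): only CONSTANT mode is accepted, and when the output info is still empty its shape checks are skipped, so the call below only verifies the input/padding combination.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/NEON/kernels/NEPadLayerKernel.h"

    arm_compute::Status pad_validate_sketch()
    {
        using namespace arm_compute;

        const TensorInfo  src(TensorShape(8U, 8U), 1, DataType::F32);
        const TensorInfo  dst;                        // unset: auto-init happens in configure()
        const PaddingList padding = {{1, 1}, {2, 2}}; // (before, after) pads for dims 0 and 1

        // constant_value and mode fall back to their defaults: PixelValue() and CONSTANT.
        return NEPadLayerKernel::validate(&src, &dst, padding);
    }
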
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 3d89933377..15e933e66e 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +37,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
}
} // namespace
-NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
{
}
-void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
- const int height)
+void NEPriorBoxLayerKernel::store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height)
{
float xmin = (center_x - box_width / 2.f) / width;
float ymin = (center_y - box_height / 2.f) / height;
float xmax = (center_x + box_width / 2.f) / width;
float ymax = (center_y + box_height / 2.f) / height;
- float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
- if(_info.clip())
+ float32x4_t vec_elements = {xmin, ymin, xmax, ymax};
+ if (_info.clip())
{
static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
int img_width = _info.img_size().x;
int img_height = _info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = _input2->info()->dimension(width_idx);
img_height = _input2->info()->dimension(height_idx);
@@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
float step_x = _info.steps()[0];
float step_y = _info.steps()[1];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
Iterator output(_output, slice);
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- float center_x = 0;
- float center_y = 0;
- int idx = id.x() / (4 * num_priors);
- center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
- center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-
- float box_width;
- float box_height;
- int offset = 0;
-
- auto out = reinterpret_cast<float *>(output.ptr());
- for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- const float min_size = _info.min_sizes().at(i);
- box_width = min_size;
- box_height = min_size;
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
-
- if(!_info.max_sizes().empty())
+ float center_x = 0;
+ float center_y = 0;
+ int idx = id.x() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+
+ float box_width;
+ float box_height;
+ int offset = 0;
+
+ auto out = reinterpret_cast<float *>(output.ptr());
+ for (unsigned int i = 0; i < _info.min_sizes().size(); ++i)
{
- const float max_size = _info.max_sizes().at(i);
- box_width = std::sqrt(min_size * max_size);
- box_height = box_width;
-
+ const float min_size = _info.min_sizes().at(i);
+ box_width = min_size;
+ box_height = min_size;
store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
- }
- // rest of priors
- for(auto ar : _info.aspect_ratios())
- {
- if(fabs(ar - 1.) < 1e-6)
+ if (!_info.max_sizes().empty())
{
- continue;
+ const float max_size = _info.max_sizes().at(i);
+ box_width = std::sqrt(min_size * max_size);
+ box_height = box_width;
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
}
- box_width = min_size * sqrt(ar);
- box_height = min_size / sqrt(ar);
+ // rest of priors
+ for (auto ar : _info.aspect_ratios())
+ {
+ if (fabs(ar - 1.) < 1e-6)
+ {
+ continue;
+ }
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
+ box_width = min_size * sqrt(ar);
+ box_height = min_size / sqrt(ar);
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+ }
}
- }
- // set the variance
- out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
- float32x4_t var;
- if(_info.variances().size() == 1)
- {
- var = vdupq_n_f32(_info.variances().at(0));
- }
- else
- {
- const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
- var = vars;
- }
- for(int i = 0; i < num_priors; ++i)
- {
- vst1q_f32(out + 4 * i, var);
- }
- },
- output);
+ // set the variance
+ out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+ float32x4_t var;
+ if (_info.variances().size() == 1)
+ {
+ var = vdupq_n_f32(_info.variances().at(0));
+ }
+ else
+ {
+ const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2),
+ _info.variances().at(3)};
+ var = vars;
+ }
+ for (int i = 0; i < num_priors; ++i)
+ {
+ vst1q_f32(out + 4 * i, var);
+ }
+ },
+ output);
}
-void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayerKernel::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu
INEKernel::configure(win);
}
-Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
@@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info)
// Run function
calculate_prior_boxes(window);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
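
For reference, a worked example of the box arithmetic in calculate_prior_boxes() and store_coordinates() above (numbers invented for illustration): with an 8x8 feature map over a 64x64 image and no explicit steps, step_x = step_y = 64/8 = 8; the cell at idx 9 (row 1, col 1) with offset 0.5 is centred at (12, 12), so a min_size of 16 stores xmin = (12 - 8)/64 = 0.0625, ymin = 0.0625, xmax = (12 + 8)/64 = 0.3125, ymax = 0.3125, which store_coordinates() then optionally clips to [0, 1] when clip() is set.
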
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
index 430a47f9f8..460f80e085 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -67,7 +67,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -84,7 +87,14 @@ private:
* @param[in] width Input width.
* @param[in] height Input height.
*/
- void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
+ void store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height);
/** Function to calculate prior boxes.
*
* @param[in] window Input region on which to execute the kernel.
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 46a0f625ce..8e1ed3a2a5 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -26,17 +26,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/NESymm.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
#include <map>
@@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
const int64_t b_3 = vgetlane(b_high, 1);
int64x2x2_t result;
- const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 };
- const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 };
+ const int64x2_t result_0{a_0 * b_0, a_1 * b_1};
+ const int64x2_t result_1{a_2 * b_2, a_3 * b_3};
result.val[0] = vadd(vmovl(vgetlow(bias)), result_0);
result.val[1] = vadd(vmovl(vgethigh(bias)), result_1);
@@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
}
} // namespace
-void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias)
+void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *weight,
+ const ITensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info()));
- static const std::map<DataType, ComputeFuncType> fn_map =
- {
- { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) },
+ static const std::map<DataType, ComputeFuncType> fn_map = {
+ {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)},
};
_input = input;
@@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o
_output->info()->set_quantization_info(compute_output_qinfo());
const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform();
- const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
+ const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
_output_shift *= -1;
- if(!bool(s))
+ if (!bool(s))
{
_output_multiplier = 0;
_output_shift = 0;
@@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target)
return window;
}
-Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_UNUSED(output, bias, weight, input);
@@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -182,11 +187,11 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
using AccType = int64_t;
using InputDataType = int16_t;
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
+ AccType sum{0};
+ AccType sum_sq{0};
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
using namespace wrapper;
const int16x8_t val = vloadq(input_ptr + x);
@@ -216,7 +221,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
#endif // __aarch64__
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
const InputDataType val = input_ptr[x];
sum += static_cast<AccType>(val);
@@ -230,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift)
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift)
{
using OutputDataType = int16_t;
@@ -238,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{});
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
const int16x8_t val = vloadq(input_ptr + x);
int32x4x2_t shifted;
@@ -267,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
vstore(output_ptr + x + 4, vqmovn(out_val.val[1]));
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
- const auto val = static_cast<int32_t>(input_ptr[x]);
- const int32_t shifted = (val << 10) - mean;
- const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
- const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
+ const auto val = static_cast<int32_t>(input_ptr[x]);
+ const int32_t shifted = (val << 10) - mean;
+ const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
+ const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
const auto reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10);
- int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
- out_val = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
- output_ptr[x] = static_cast<OutputDataType>(out_val);
+ int32_t out_val =
+ quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
+ out_val =
+ utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
+ output_ptr[x] = static_cast<OutputDataType>(out_val);
}
}
@@ -287,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16()
using BiasDataType = int32_t;
using AccType = int64_t;
- Iterator input_iterator{ _input, _inout_window };
- Iterator output_iterator{ _output, _inout_window };
- Iterator weight_iterator{ _weight, _weight_window };
- Iterator bias_iterator{ _bias, _weight_window };
+ Iterator input_iterator{_input, _inout_window};
+ Iterator output_iterator{_output, _inout_window};
+ Iterator weight_iterator{_weight, _weight_window};
+ Iterator bias_iterator{_bias, _weight_window};
const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr());
const auto bias_ptr = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr());
const uint32_t column_size = _input->info()->tensor_shape()[0];
- execute_window_loop(_inout_window, [ &, this](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
- auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
-
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
- std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
-
- AccType mean{ 0 };
- AccType variance{ 0 };
- std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
-
- int32_t stddev_invsqrt_mul{};
- int32_t stddev_invsqrt_shift{};
- quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift);
-
- normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
- },
- input_iterator, output_iterator);
+ execute_window_loop(
+ _inout_window,
+ [&, this](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
+ auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
+
+ AccType sum{0};
+ AccType sum_sq{0};
+ std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
+
+ AccType mean{0};
+ AccType variance{0};
+ std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
+
+ int32_t stddev_invsqrt_mul{};
+ int32_t stddev_invsqrt_shift{};
+ quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul,
+ stddev_invsqrt_shift);
+
+ normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
+ },
+ input_iterator, output_iterator);
}
} // namespace arm_compute
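
The statistics feeding normalize_qasymm16() are gathered by sum_qsymm16() and folded into a mean and variance by compute_mean_variance(), which is not shown in this hunk. A plain floating-point sketch of the usual E[x^2] - E[x]^2 folding, assuming that is what the helper implements (the real code works in fixed point with the Q10 shift seen above):

    #include <cstdint>
    #include <utility>

    std::pair<float, float> mean_variance_sketch(int64_t sum, int64_t sum_sq, uint32_t n)
    {
        const float mean     = static_cast<float>(sum) / n;
        const float variance = static_cast<float>(sum_sq) / n - mean * mean; // E[x^2] - E[x]^2
        return {mean, variance};
    }
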
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index a3ff6e988f..af5b6a0315 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
#include "src/core/NEON/INEKernel.h"
+
#include <functional>
namespace arm_compute
@@ -69,34 +70,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
// constants
- static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */
- static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
- static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */
- static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */
+ static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */
+ static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */
+ static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */
+ static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */
using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
ComputeFuncType _fn{}; /**< Function pointer to computation function */
- const ITensor *_input
- {
- nullptr
- }; /**< Input tensor */
- const ITensor *_weight
- {
- nullptr
- }; /**< Weight tensor */
- const ITensor *_bias
- {
- nullptr
- }; /**< Bias tensor */
- ITensor *_output{ nullptr }; /**< Output tensor */
+ const ITensor *_input{nullptr}; /**< Input tensor */
+ const ITensor *_weight{nullptr}; /**< Weight tensor */
+ const ITensor *_bias{nullptr}; /**< Bias tensor */
+ ITensor *_output{nullptr}; /**< Output tensor */
int32_t _output_multiplier{}; /**< Multiplier for output values */
int32_t _output_shift{}; /**< Shift value for output values */
@@ -138,7 +131,9 @@ private:
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift);
/** Function to compute output quantization information */
QuantizationInfo compute_output_qinfo();
};
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 802aebb526..486cd6d331 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/roialign/list.h"
@@ -49,7 +50,12 @@ struct ROIAlignSelectorData
};
using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
-using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)>::type;
struct ROIAlignKernel
{
@@ -58,31 +64,18 @@ struct ROIAlignKernel
ROIAlignUKernelPtr ukernel;
};
-static const ROIAlignKernel available_kernels[] =
-{
- {
- "fp32_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
- },
+static const ROIAlignKernel available_kernels[] = {
+ {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
- },
+ {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "qu8_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
- },
- {
- "qs8_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
- },
+ {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)},
+ {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
};
@@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] =
*/
const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
+ if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel()
{
}
-void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
    // Output auto initialization if not yet initialized
const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
- auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
output->info()->set_data_layout(input->info()->data_layout());
// Configure kernel window
@@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois,
INEKernel::configure(window);
}
-Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
@@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn
void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
{
const DataLayout data_layout = _input->info()->data_layout();
- if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
{
- const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _rois, _pool_info, window, info);
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index 48a3de7285..9cc538b429 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -83,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 400e8291d6..1a3810fb56 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +38,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois);
@@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
* @param[in] roi_indx Index of image of coordinate in output Tensor to store value
*/
template <typename T>
-void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y,
- int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx)
+void template_eval(const ITensor *input,
+ const ITensor *output,
+ int region_start_x,
+ int region_start_y,
+ int region_end_x,
+ int region_end_y,
+ int fm,
+ int px,
+ int py,
+ int roi_batch,
+ int roi_indx)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
*reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
}
else
{
T curr_max = std::numeric_limits<T>::lowest(); // Min value of typename T
- for(int j = region_start_y; j < region_end_y; ++j)
+ for (int j = region_start_y; j < region_end_y; ++j)
{
- for(int i = region_start_x; i < region_end_x; ++i)
+ for (int i = region_start_x; i < region_end_x; ++i)
{
const auto val = *reinterpret_cast<const T *>(input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
curr_max = std::max(val, curr_max);
@@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start
}
// if quantized datatype, requantize then store in output tensor
- if(is_data_type_quantized(input->info()->data_type()))
+ if (is_data_type_quantized(input->info()->data_type()))
{
        // convert qasymm to new output quantization scale and offset
- UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo);
+ UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(
+ input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) =
+ quantize_qasymm8(curr_max, uqinfo);
}
else
{
@@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
{
}
-Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
}
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
@@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) ||
+ (output->info()->dimension(1) != pool_info.pooled_height()));
// Set instance variables
_input = input;
@@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
const auto data_type = _input->info()->data_type();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
const auto x1 = rois_ptr[values_per_roi * roi_indx + 1];
@@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
// Iterate through all feature maps
- for(int fm = 0; fm < fms; ++fm)
+ for (int fm = 0; fm < fms; ++fm)
{
// Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
+ for (int py = 0; py < pooled_h; ++py)
{
- for(int px = 0; px < pooled_w; ++px)
+ for (int px = 0; px < pooled_w; ++px)
{
auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
- auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
- auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
- auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+ auto region_end_x =
+ static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+ auto region_start_y =
+ static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+ auto region_end_y =
+ static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
- template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
case DataType::QASYMM8:
- template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
default:
ARM_COMPUTE_ERROR("DataType not Supported");
@@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
}
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
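
The ROI pooling kernel above divides each ROI into a pooled_w x pooled_h grid, maps every output cell to a clamped input region, and max-pools that region (requantizing the result for QASYMM8). A compact sketch of just the region arithmetic, mirroring the index math in the hunk with illustrative parameter names:

#include <algorithm>
#include <cmath>

// Illustrative only: map output cell (px, py) of a pooled_w x pooled_h grid
// onto an input region of the ROI, clamped to the feature-map bounds.
struct Region
{
    int start_x, end_x, start_y, end_y;
};

static Region roi_pool_region(int px, int py, int pooled_w, int pooled_h,
                              int roi_anchor_x, int roi_anchor_y,
                              int roi_width, int roi_height,
                              int fm_width, int fm_height)
{
    // Fractional position of the cell inside the ROI, floored to input indices
    int start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
    int end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
    int start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
    int end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));

    // Shift by the ROI anchor and clamp to the feature map
    start_x = std::min(std::max(start_x + roi_anchor_x, 0), fm_width);
    end_x   = std::min(std::max(end_x + roi_anchor_x, 0), fm_width);
    start_y = std::min(std::max(start_y + roi_anchor_y, 0), fm_height);
    end_y   = std::min(std::max(end_y + roi_anchor_y, 0), fm_height);
    return {start_x, end_x, start_y, end_y};
}

An empty region (end <= start) corresponds to the branch in template_eval above that writes 0 to the output element.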
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
index e7a7e90eef..81f6006ea2 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -63,7 +63,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
*/
- void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index ec63a35de9..87b7b76b72 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -29,11 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/range/list.h"
namespace arm_compute
@@ -55,48 +56,23 @@ struct RangeUKernel
RangeUKernelPtr ukernel;
};
-static const RangeUKernel available_kernels[] =
-{
- {
- "fp16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
- },
- {
- "f32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
- },
- {
- "u8_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
- },
- {
- "u16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
- },
- {
- "u32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
- },
- {
- "s8_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
- },
- {
- "s16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
- },
- {
- "s32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
- },
+static const RangeUKernel available_kernels[] = {
+ {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)},
+ {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)},
+ {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)},
+ {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)},
+ {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)},
+ {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)},
+ {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)},
+ {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)},
};
/** Micro-kernel selector
@@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] =
*/
const RangeUKernel *get_implementation(const RangeSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data)
Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
{
- const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+ const auto *uk = get_implementation(RangeSelectorData{output.data_type()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-NERangeKernel::NERangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
}
@@ -151,7 +130,8 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
// Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1,
+ output->info()->data_type(), output->info()->quantization_info());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
+ const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()});
uk->ukernel(_output, _start, _step, window);
}
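
The NERangeKernel.cpp hunks above reshape the micro-kernel selector table: each entry pairs a name, a predicate over the selector data, and a registered kernel function, and get_implementation returns the first matching entry. A self-contained sketch of the same dispatch pattern, with hypothetical types standing in for the library's RangeSelectorData, RangeUKernelPtr and REGISTER_* macros:

#include <cstdio>

// Hypothetical stand-ins for the library types used above.
enum class Dt { F32, F16, U8 };
struct SelectorData { Dt dt; };
using KernelFn = void (*)();

struct KernelEntry
{
    const char *name;                              // human-readable kernel name
    bool (*is_selected)(const SelectorData &data); // predicate over the selector data
    KernelFn ukernel;                              // kernel to run when selected
};

static void run_f32() { std::puts("f32 kernel"); }
static void run_u8()  { std::puts("u8 kernel"); }

static const KernelEntry available_kernels[] = {
    {"f32_kernel", [](const SelectorData &d) { return d.dt == Dt::F32; }, run_f32},
    {"u8_kernel",  [](const SelectorData &d) { return d.dt == Dt::U8; },  run_u8},
};

// Return the first entry whose predicate accepts the data, else nullptr.
static const KernelEntry *get_implementation(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}

int main()
{
    const SelectorData data{Dt::U8};
    if (const KernelEntry *uk = get_implementation(data))
    {
        uk->ukernel(); // prints "u8 kernel"
    }
}

A caller mirrors the kernel's run() above: look up the implementation from the output data type, check for nullptr, then invoke the function pointer.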
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 90560995e6..fa555c2c2e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NERANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 19955af493..455d604b3b 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -28,16 +28,17 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
namespace arm_compute
@@ -48,7 +49,7 @@ namespace
template <typename T>
void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
{
- if(std::is_same<T, uint8_t>::value)
+ if (std::is_same<T, uint8_t>::value)
{
auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
wrapper::vstore(output.ptr() + offset, res);
@@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset
template <typename T>
uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4_t mask{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4_t mask{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask = wrapper::vcgt(b, a);
}
@@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp
mask = wrapper::vclt(b, a);
}
- uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
- if(axis != 0)
+ uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3};
+ if (axis != 0)
{
vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
+ uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}};
return res;
}
@@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp
template <typename T>
uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4x4_t mask{ { 0 } };
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4x4_t mask{{0}};
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask_u8 = wrapper::vcgt(b, a);
}
@@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R
{
mask_u8 = wrapper::vclt(b, a);
}
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
-
- uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 },
- { idx + 8, idx + 9, idx + 10, idx + 11 },
- { idx + 12, idx + 13, idx + 14, idx + 15 }
- }
- };
- if(axis != 0)
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ mask.val[0] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ mask.val[1] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ mask.val[2] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ mask.val[3] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+
+ uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3},
+ {idx + 4, idx + 5, idx + 6, idx + 7},
+ {idx + 8, idx + 9, idx + 10, idx + 11},
+ {idx + 12, idx + 13, idx + 14, idx + 15}}};
+ if (axis != 0)
{
vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res =
- {
- {
- vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
- vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
- vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
- vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
- }
- };
+ uint32x4x4_t res = {
+ {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}};
return res;
}
// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_min(T in)
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_min(T in)
{
auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
return wrapper::vpmin(pmin, pmin);
@@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_
// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_min(T in)
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_min(T in)
{
auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
pmin = wrapper::vpmin(pmin, pmin);
@@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_max(T in)
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_max(T in)
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
return wrapper::vpmax(pmax, pmax);
@@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_max(T in)
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_max(T in)
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
pmax = wrapper::vpmax(pmax, pmax);
@@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s
template <typename T>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
- uint32x4_t res_idx_mask{ 0 };
+ uint32x4_t res_idx_mask{0};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- if(op == ReductionOperation::ARG_IDX_MIN)
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc
template <typename T>
uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
- uint32x4x4_t res_idx_mask{ { 0 } };
+ uint32x4x4_t res_idx_mask{{0}};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va
}
// Widen vectors
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
@@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va
pmin = wrapper::vpmin(pmin, pmin);
res = std::min(wrapper::vgetlane(pmin, 0), res);
iter++;
- }
- while(iter < 4);
+ } while (iter < 4);
return (res - 0xFFFFFFFF);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
-uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+uint32x4x4_t
+calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4x2_t mask{ 0 };
- uint16x8_t mask_u16{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4x2_t mask{0};
+ uint16x8_t mask_u16{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask_u16 = wrapper::vcgt(b, a);
}
@@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x
}
mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
- uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 }
- }
- };
- if(axis != 0)
+ uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}};
+ if (axis != 0)
{
vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
- wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
- 0, 0
- };
+ uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0};
return res;
}
@@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in)
template <>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
{
- uint32x4x2_t res_idx_mask{ 0 };
+ uint32x4x2_t res_idx_mask{0};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
uint16x8_t mask_u16;
- if(op == ReductionOperation::ARG_IDX_MIN)
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va
}
// Widen vectors
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
@@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va
pmin = wrapper::vpmin(pmin, pmin);
res = std::min(wrapper::vgetlane(pmin, 0), res);
iter++;
- }
- while(iter < 2);
+ } while (iter < 2);
return (res - 0xFFFFFFFF);
}
@@ -388,7 +393,8 @@ struct RedOpX
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
const size_t input_dim_0 = in->info()->dimension(0);
const int window_step_x = 16 / sizeof(T);
@@ -402,211 +408,217 @@ struct RedOpX
Iterator output(out, out_window);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-
- auto init_res_value = static_cast<T>(0.f);
- switch(op)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- init_res_value = static_cast<T>(*input_ptr);
- break;
- }
- case ReductionOperation::PROD:
- {
- init_res_value = static_cast<T>(1.f);
- break;
- }
- default:
- break;
- }
- auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
- uint32x4x4_t vec_res_idx{ { 0 } };
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
+ auto init_res_value = static_cast<T>(0.f);
+ switch (op)
{
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
+ case ReductionOperation::ARG_IDX_MIN:
case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ init_res_value = static_cast<T>(*input_ptr);
break;
}
- case ReductionOperation::MAX:
+ case ReductionOperation::PROD:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ init_res_value = static_cast<T>(1.f);
break;
}
default:
- ARM_COMPUTE_ERROR("Not supported");
+ break;
}
- }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{{0}};
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM_SQUARE:
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
- auto res = static_cast<T>(0.f);
- for(int i = 0; i < S; ++i)
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
{
- res += wrapper::vgetlane(vec_res_value, i);
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
-#else // ARM_COMPUTE_DEBUG_ENABLED
- auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- for(int i = 0; i < S / 4; ++i)
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM_SQUARE:
{
- carry_res = wrapper::vpadd(carry_res, carry_res);
- }
- auto res = wrapper::vgetlane(carry_res, 0);
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+ auto res = static_cast<T>(0.f);
+ for (int i = 0; i < S; ++i)
+ {
+ res += wrapper::vgetlane(vec_res_value, i);
+ }
+#else // ARM_COMPUTE_DEBUG_ENABLED
+ auto carry_res =
+ wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for (int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
#endif // ARM_COMPUTE_DEBUG_ENABLED
- if(op == ReductionOperation::SUM_SQUARE)
- {
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ if (op == ReductionOperation::SUM_SQUARE)
+ {
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += (*(input_ptr + x)) * (*(input_ptr + x));
+ }
+ }
+ else
{
- res += (*(input_ptr + x)) * (*(input_ptr + x));
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
}
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res /= input_dim_0;
+ }
+
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- else
+ case ReductionOperation::PROD:
{
+ auto carry_res =
+ wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ T res = 1;
+ for (int i = 0; i < S / 2; ++i)
+ {
+ res *= wrapper::vgetlane(carry_res, i);
+ }
+
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- res += *(input_ptr + x);
+ res *= *(input_ptr + x);
}
- }
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= input_dim_0;
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- T res = 1;
- for(int i = 0; i < S / 2; ++i)
+ case ReductionOperation::ARG_IDX_MIN:
{
- res *= wrapper::vgetlane(carry_res, i);
- }
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res *= *(input_ptr + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
}
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::ARG_IDX_MAX:
{
- if(*(input_ptr + x) < res)
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
}
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MIN:
{
- if(*(input_ptr + x) > res)
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MAX:
{
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input, output);
+ },
+ input, output);
}
};
template <typename T>
struct RedOpX_quantized
{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
@@ -637,246 +649,257 @@ struct RedOpX_quantized
const float B = out_offset - (in_scale * in_offset) / (out_scale);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+ auto vec_res_value1 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value2 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value3 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value4 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+
+ auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+ typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0};
+
+ if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN ||
+ op == ReductionOperation::MIN || op == ReductionOperation::MAX)
+ {
+ vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
+ }
+
+ uint32x4x4_t vec_res_idx{{0}};
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+ const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
- typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 };
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX)
- {
- vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
- }
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- uint32x4x4_t vec_res_idx{ { 0 } };
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
- const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
- //de-quantize vec_elements
- temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
- break;
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
+ }
+
+ switch (op)
+ {
case ReductionOperation::ARG_IDX_MIN:
{
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
break;
}
case ReductionOperation::ARG_IDX_MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
break;
}
case ReductionOperation::MIN:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) < res)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MAX:
{
- if(*(input_ptr + x) > res)
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::PROD:
{
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
+ float res = wrapper::vgetlane(carry_res, 0);
+ res *= wrapper::vgetlane(carry_res, 1);
+ res *= wrapper::vgetlane(carry_res, 2);
+ res *= wrapper::vgetlane(carry_res, 3);
- float res = wrapper::vgetlane(carry_res, 0);
- res *= wrapper::vgetlane(carry_res, 1);
- res *= wrapper::vgetlane(carry_res, 2);
- res *= wrapper::vgetlane(carry_res, 3);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ }
+ else
+ {
+ res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ }
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
+ //re-quantize result
+ if (std::is_same<T, uint8_t>::value)
{
- res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ res = quantize_qasymm8(res, iq_info);
}
else
{
- res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ res = quantize_qasymm8_signed(res, iq_info);
}
- }
- //re-quantize result
- if(std::is_same<T, uint8_t>::value)
- {
- res = quantize_qasymm8(res, iq_info);
+ *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+ break;
}
- else
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
{
- res = quantize_qasymm8_signed(res, iq_info);
- }
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
- *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
- carry_res = wrapper::vadd(carry_res, vec_res_value3);
- carry_res = wrapper::vadd(carry_res, vec_res_value4);
+ auto carry_paddition =
+ wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += *(input_ptr + x);
- }
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ const int32_t resFinal = A * (static_cast<float>(res)) + B;
- if(op == ReductionOperation::MEAN_SUM)
- {
- const int32_t resFinal = A * (static_cast<float>(res)) + B;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
+ }
+ else
+ {
+ // Subtract accumulated offsets
+ res -= (in_info.dimension(0) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ }
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
- }
- else
- {
- // Subtract accumulated offsets
- res -= (in_info.dimension(0) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ break;
}
-
- break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input, output);
+ },
+ input, output);
}
};
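
The quantized X-axis reducer above widens each 16-lane u8/s8 vector to u16 and then to u32 before accumulating, so long rows cannot overflow the narrow lanes. A minimal standalone sketch of that SUM path, using raw NEON intrinsics instead of the library's wrapper:: layer and folding the four accumulators (vec_res_value1..4) into a single one for brevity; the function name is illustrative only:

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

uint32_t sum_u8_row(const uint8_t *ptr, size_t n)
{
    uint32x4_t acc = vdupq_n_u32(0);
    size_t     x   = 0;
    for (; x + 16 <= n; x += 16)
    {
        const uint8x16_t v  = vld1q_u8(ptr + x);
        const uint16x8_t lo = vmovl_u8(vget_low_u8(v));   // u8  -> u16
        const uint16x8_t hi = vmovl_u8(vget_high_u8(v));
        acc = vaddq_u32(acc, vmovl_u16(vget_low_u16(lo)));  // u16 -> u32
        acc = vaddq_u32(acc, vmovl_u16(vget_high_u16(lo)));
        acc = vaddq_u32(acc, vmovl_u16(vget_low_u16(hi)));
        acc = vaddq_u32(acc, vmovl_u16(vget_high_u16(hi)));
    }
    // Horizontal add of the four u32 lanes (the kernel uses pairwise adds the same way)
    uint32x2_t half = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    half            = vpadd_u32(half, half);
    uint32_t res    = vget_lane_u32(half, 0);
    // Left-over elements, as in the scalar tail loops above
    for (; x < n; ++x)
    {
        res += ptr[x];
    }
    return res;
}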
@@ -887,7 +910,12 @@ struct RedOpYZW
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
{
const TensorInfo in_info = *(in->info());
const int window_step_x = 16 / sizeof(T);
@@ -900,203 +928,210 @@ struct RedOpYZW
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- neon_vector vec_res_value = { 0 };
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vloadq(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
- break;
- }
- default:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- break;
- }
- }
- uint32x4x4_t vec_res_idx{ { 0 } };
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
+ neon_vector vec_res_value = {0};
+ switch (op)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
+ vec_res_value = wrapper::vloadq(input_ptr + x);
break;
}
- case ReductionOperation::MIN:
+ case ReductionOperation::PROD:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
break;
}
- case ReductionOperation::MAX:
+ default:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
- }
+ uint32x4x4_t vec_res_idx{{0}};
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- if(std::is_same<T, float16_t>::value)
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- }
- else
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value = 0.f;
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
+ if (op == ReductionOperation::MEAN_SUM)
{
- res_value = *(input_ptr + x);
- break;
+ auto vec_width_inv =
+ wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
}
- case ReductionOperation::PROD:
+
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
- res_value = static_cast<T>(1.f);
- break;
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ if (std::is_same<T, float16_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
- default:
+ else
{
- res_value = static_cast<T>(0.f);
- break;
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
}
}
- uint32_t res_idx = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
-
- switch(op)
+ auto res_value = 0.f;
+ switch (op)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- res_value += *in_ptr;
- break;
- case ReductionOperation::SUM_SQUARE:
- res_value += *in_ptr * *in_ptr;
- break;
- case ReductionOperation::PROD:
- res_value *= *in_ptr;
- break;
+ case ReductionOperation::ARG_IDX_MAX:
case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- if(*in_ptr < res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
+ res_value = *(input_ptr + x);
break;
}
- case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::PROD:
{
- if(*in_ptr > res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
+ res_value = static_cast<T>(1.f);
break;
}
- case ReductionOperation::MIN:
+ default:
{
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ res_value = static_cast<T>(0.f);
break;
}
- case ReductionOperation::MAX:
+ }
+
+ uint32_t res_idx = 0;
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+
+ switch (op)
{
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
- break;
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ res_value += *in_ptr;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ res_value += *in_ptr * *in_ptr;
+ break;
+ case ReductionOperation::PROD:
+ res_value *= *in_ptr;
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
- if(op == ReductionOperation::MEAN_SUM)
- {
- res_value /= in_info.dimension(axis);
- }
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res_value /= in_info.dimension(axis);
+ }
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
- }
- else
- {
- *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ }
}
- }
- },
- input, output);
+ },
+ input, output);
}
};
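
RedOpYZW above reduces along a non-X axis by re-reading the input at input.ptr() + x * sizeof(T) + strides_in_bytes()[axis] * dim for every step dim of the reduced axis, which is why one functor serves axes 1 to 3. A scalar sketch of that strided walk for the MEAN_SUM case, with illustrative names and the NEON and left-over handling stripped out:

#include <cstddef>
#include <cstdint>

float reduce_axis_mean(const float *base, size_t x, size_t axis_dim, size_t axis_stride_bytes)
{
    const auto *bytes = reinterpret_cast<const uint8_t *>(base);
    float       acc   = 0.f;
    for (size_t dim = 0; dim < axis_dim; ++dim)
    {
        // Same addressing as the kernel: element offset in X plus stride * dim along the reduced axis
        const float *in_ptr =
            reinterpret_cast<const float *>(bytes + x * sizeof(float) + axis_stride_bytes * dim);
        acc += *in_ptr;
    }
    return acc / static_cast<float>(axis_dim); // MEAN_SUM divides by the axis length
}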
@@ -1107,7 +1142,8 @@ struct RedOpYZW_complex
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
{
ARM_COMPUTE_ERROR_ON(axis != 2);
ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
@@ -1124,70 +1160,77 @@ struct RedOpYZW_complex
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- neon_vector vec_res_value_0 = { 0 };
- neon_vector vec_res_value_1 = { 0 };
-
- vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
+ neon_vector vec_res_value_0 = {0};
+ neon_vector vec_res_value_1 = {0};
- const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
- const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+ vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
- vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
- }
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
- wrapper::vstore(out_ptr, vec_res_value_0);
- wrapper::vstore(out_ptr + 4, vec_res_value_1);
- }
+ const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+ const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value_0 = 0.f;
- auto res_value_1 = 0.f;
+ vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+ vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+ }
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ wrapper::vstore(out_ptr, vec_res_value_0);
+ wrapper::vstore(out_ptr + 4, vec_res_value_1);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- res_value_0 += *in_ptr;
- res_value_1 += *(in_ptr + 1);
+ auto res_value_0 = 0.f;
+ auto res_value_1 = 0.f;
+
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ res_value_0 += *in_ptr;
+ res_value_1 += *(in_ptr + 1);
+ }
+ *out_ptr = res_value_0;
+ *(out_ptr + 1) = res_value_1;
}
- *out_ptr = res_value_0;
- *(out_ptr + 1) = res_value_1;
- }
- },
- input, output);
+ },
+ input, output);
}
};
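
RedOpYZW_complex handles two-channel (complex) tensors by loading two adjacent vectors per position and keeping separate real and imaginary accumulators, stepping through the reduced axis with the byte stride stride_z. A scalar sketch of the same interleaved layout, using a hypothetical per-element stride instead of the byte arithmetic:

#include <cstddef>
#include <utility>

std::pair<float, float> sum_complex_axis(const float *interleaved, size_t axis_dim, size_t stride_floats)
{
    float re = 0.f;
    float im = 0.f;
    for (size_t dim = 0; dim < axis_dim; ++dim)
    {
        const float *p = interleaved + dim * stride_floats; // distance between consecutive complex values
        re += p[0]; // real part
        im += p[1]; // imaginary part
    }
    return {re, im};
}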
template <typename T>
struct RedOpYZW_quantized
{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
{
const TensorInfo in_info = *(in->info());
const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+ using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
const auto oq_info = out->info()->quantization_info().uniform();
@@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
- using vector_type = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
+ using vector_type =
+ typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
vector_type vec_res_value1{};
@@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized
const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- uint32x4x4_t vec_res_idx{ { 0 } };
- vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
- vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ uint32x4x4_t vec_res_idx{{0}};
+ vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- auto vec_res_value = wrapper::vloadq(input_ptr + x);
+ vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
- {
- const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
+ auto vec_res_value = wrapper::vloadq(input_ptr + x);
+
+ for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
+ const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
{
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset),
+ wrapper::traits::vector_128_tag{});
+ const auto scale32x4f_4 =
+ wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+ }
- //de-quantize vec_elements
- temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
- break;
- }
+ switch (op)
+ {
case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12,
+ vec_res_idx.val[3]);
break;
}
case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
case ReductionOperation::MAX:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
- break;
- }
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract offsets
- auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
+ case ReductionOperation::SUM:
+ {
+ // Subtract offsets
+ auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
- auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
- auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
- auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
- auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
+ auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
+ auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
+ auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
+ auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
- vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
- vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
- vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
- vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+ vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+ vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+ vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+ vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
- combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
- break;
- }
- case ReductionOperation::MEAN_SUM:
- {
- vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
- vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
- vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
- vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
+ combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
+ break;
+ }
+ case ReductionOperation::MEAN_SUM:
+ {
+ vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+ vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+ vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+ vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
#ifdef __aarch64__
- vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+ vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
#else // defined(__aarch64__)
- vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+ vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
#endif // __aarch64__
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
-
- //re-quantize
- vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
-
- vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 =
+ wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+ const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
+
+ //re-quantize
+ vec_res_value1_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value2_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value3_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value4_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+ vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- float res_value = 0.f;
- int32_t res_value_q = 0;
-
- switch(op)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- res_value = *(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- res_value = static_cast<T>(1.0f);
- break;
- }
- default:
- {
- res_value = static_cast<T>(0.0f);
- break;
- }
- }
- uint32_t res_idx = 0;
+ float res_value = 0.f;
+ int32_t res_value_q = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
- {
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
- switch(op)
+ switch (op)
{
- case ReductionOperation::SUM:
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- res_value += *in_ptr;
+ res_value = *(input_ptr + x);
break;
}
- case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::PROD:
{
- res_value_q += *in_ptr;
+ res_value = static_cast<T>(1.0f);
break;
}
- case ReductionOperation::SUM_SQUARE:
+ default:
{
- res_value += *in_ptr * *in_ptr;
+ res_value = static_cast<T>(0.0f);
break;
}
- case ReductionOperation::PROD:
+ }
+ uint32_t res_idx = 0;
+
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
+ switch (op)
{
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
+ case ReductionOperation::SUM:
{
- res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ res_value += *in_ptr;
+ break;
}
- else
+ case ReductionOperation::MEAN_SUM:
{
- res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ res_value_q += *in_ptr;
+ break;
}
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- if(*in_ptr < res_value)
+ case ReductionOperation::SUM_SQUARE:
{
- res_value = *in_ptr;
- res_idx = dim;
+ res_value += *in_ptr * *in_ptr;
+ break;
}
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- if(*in_ptr > res_value)
+ case ReductionOperation::PROD:
{
- res_value = *in_ptr;
- res_idx = dim;
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ }
+ else
+ {
+ res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ }
+ break;
}
- break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- case ReductionOperation::MIN:
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::MEAN_SUM:
{
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef __aarch64__
+ const int32_t res =
+ arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else // defined(__aarch64__)
+ const int32_t res = A * (static_cast<float>(res_value_q)) + B;
+#endif // __aarch64__
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
break;
}
- case ReductionOperation::MAX:
+ case ReductionOperation::SUM:
{
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ // Subtract accumulated offsets
+ res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::MEAN_SUM:
- {
- // Apply previously calculated coefficients (with rounding on aarch64)
-#ifdef __aarch64__
- const int32_t res = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
-#else // defined(__aarch64__)
- const int32_t res = A * (static_cast<float>(res_value_q)) + B;
-#endif // __aarch64__
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract accumulated offsets
- res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
- break;
- }
- case ReductionOperation::PROD:
- {
- //re-quantize result
- T res = 0;
- if(std::is_same<T, uint8_t>::value)
+ case ReductionOperation::PROD:
{
- res = quantize_qasymm8(res_value, iq_info);
+ //re-quantize result
+ T res = 0;
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res = quantize_qasymm8(res_value, iq_info);
+ }
+ else
+ {
+ res = quantize_qasymm8_signed(res_value, iq_info);
+ }
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res;
+ break;
}
- else
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
{
- res = quantize_qasymm8_signed(res_value, iq_info);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
+ break;
}
- *(reinterpret_cast<T *>(output.ptr() + x)) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
- break;
+ default:
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
}
- default:
- *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
}
- }
- },
- input, output);
+ },
+ input, output);
}
};
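
In the quantized PROD branches above, every element is de-quantized with (q - offset) * scale, the products are accumulated in float, and the result is re-quantized before the store. A compact scalar sketch of that round trip for QASYMM8; the rounding policy of the real quantize_qasymm8() may differ from the plain std::lround used here:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

uint8_t prod_qasymm8(const uint8_t *ptr, size_t n, float scale, int32_t offset)
{
    float prod = 1.f;
    for (size_t x = 0; x < n; ++x)
    {
        prod *= (static_cast<float>(ptr[x]) - static_cast<float>(offset)) * scale; // de-quantize
    }
    // Re-quantize: inverse scale, add the zero point back, clamp to the u8 range
    const int32_t q = static_cast<int32_t>(std::lround(prod / scale)) + offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}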
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
+void reduce_op(
+ const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
const bool is_complex = (input->info()->num_channels() == 2);
- if(is_complex)
+ if (is_complex)
{
- switch(axis)
+ switch (axis)
{
case 2:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::F32:
- switch(op)
+ switch (op)
{
case ReductionOperation::SUM:
- return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+ return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
+ window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(),
+ op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
return;
}
- switch(axis)
+ switch (axis)
{
case 0:
{
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
{
- return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+ return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output,
+ RedOpX_quantized<uint8_t>(), op);
}
case DataType::QASYMM8_SIGNED:
{
- return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+ return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(),
+ op);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
}
}
case 1:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
{
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
}
case DataType::QASYMM8_SIGNED:
{
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
ARM_COMPUTE_ERROR("Not supported");
}
case 2:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
ARM_COMPUTE_ERROR("Not supported");
}
case 3:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
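
A short worked note on the "Subtract accumulated offsets" step in the quantized SUM reducers above: each QASYMM8 term q_i = r_i / scale + offset carries one copy of the zero point, so a sum of N terms carries N of them, while the stored result should carry exactly one; hence (N - 1) * offset is removed from the accumulator. A scalar sketch, assuming N > 0:

#include <algorithm>
#include <cstddef>
#include <cstdint>

uint8_t sum_qasymm8(const uint8_t *ptr, size_t n, int32_t offset)
{
    int32_t acc = 0;
    for (size_t x = 0; x < n; ++x)
    {
        acc += ptr[x]; // each term contributes one copy of the zero-point offset
    }
    acc -= (static_cast<int32_t>(n) - 1) * offset;          // keep exactly one offset in the result
    return static_cast<uint8_t>(std::clamp(acc, 0, 255));   // saturate like saturate_cast<T>
}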
@@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
@@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
- if(!is_arg_min_max)
+ if (!is_arg_min_max)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
@@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
}
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
- const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
@@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel()
{
}
-void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperationKernel::configure(const ITensor *input,
+ ITensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
INEKernel::configure(win);
// Calculate output shape and set if empty
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
// Output auto initialization if not yet initialized
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
}
-Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index 08e654fd21..78bec62c14 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -77,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
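Editor's note (not part of the patch): the hunk above only re-wraps the four-argument validate() declaration. As an illustrative sketch of how that static check might be driven with plain TensorInfo descriptors, assuming compilation inside the library tree (the kernel header lives under src/) and assuming the reduced axis is kept as size 1 in the output descriptor; shapes and the SUM reduction are arbitrary examples.

// Illustrative sketch only -- not part of this patch. Assumes it is built
// within the library tree, since NEReductionOperationKernel.h is an internal
// header under src/. Shapes and the reduction choice are example assumptions.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"

#include <iostream>

int main()
{
    using namespace arm_compute;
    // 8x4x6 F32 input reduced along axis 2; the reduced axis is assumed to be
    // kept with size 1 in the output descriptor.
    const TensorInfo input(TensorShape(8U, 4U, 6U), 1, DataType::F32);
    const TensorInfo output(TensorShape(8U, 4U, 1U), 1, DataType::F32);

    const Status status = NEReductionOperationKernel::validate(&input, &output, 2, ReductionOperation::SUM);
    std::cout << (status.error_code() == ErrorCode::OK ? "valid" : status.error_description()) << std::endl;
    return 0;
}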
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
index 1a7f58bb08..f92a4c87da 100644
--- a/src/core/NEON/kernels/NEReorderKernel.cpp
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -24,11 +24,13 @@
#if defined(__aarch64__)
#include "src/core/NEON/kernels/NEReorderKernel.h"
-#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
namespace arm_compute
{
@@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
const int ksize_rows_elements = _xmax * _ksize;
- const int jump_rows = ksize_rows_elements * window.x().start();
- const int k_start = window.x().start() * _ksize;
- const int k_end = std::min(window.x().end() * _ksize, _kmax);
- const int stride = _kmax;
- if(k_start < k_end)
+ const int jump_rows = ksize_rows_elements * window.x().start();
+ const int k_start = window.x().start() * _ksize;
+ const int k_end = std::min(window.x().end() * _ksize, _kmax);
+ const int stride = _kmax;
+ if (k_start < k_end)
{
-
- switch(_output_wf)
+ switch (_output_wf)
{
case WeightFormat::OHWIo4:
{
- arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
break;
}
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
{
- arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
break;
}
#endif /* ARM_COMPUTE_ENABLE_SVE */
@@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
}
NEReorderKernel::NEReorderKernel()
- : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY)
+ : _input(nullptr),
+ _output(nullptr),
+ _ksize(0),
+ _kmax(0),
+ _xmax(0),
+ _input_wf(WeightFormat::ANY),
+ _output_wf(WeightFormat::ANY)
{
}
-void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderKernel::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
// Setting parameters for transform
auto dims = input->info()->num_dimensions();
- switch(dims)
+ switch (dims)
{
case 2:
{
@@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
// Window size is set by rows / _ksize
Window win;
int window_size = 0;
- switch(_output_wf)
+ switch (_output_wf)
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
@@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
break;
}
}
- if(_kmax % _ksize != 0)
+ if (_kmax % _ksize != 0)
{
window_size += 1;
}
@@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
INEKernel::configure(win);
}
-Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
int output_x_dim;
int output_k_dim;
auto dims = output->num_dimensions();
- switch(dims)
+ switch (dims)
{
case 2:
{
- input_x_dim = input->dimension(0); // Number of columns in input matrix
- input_k_dim = input->dimension(1); // Number of rows in input matrix
+ input_x_dim = input->dimension(0); // Number of columns in input matrix
+ input_k_dim = input->dimension(1); // Number of rows in input matrix
output_x_dim = output->dimension(0); // Number of columns in output matrix
output_k_dim = output->dimension(1); // Number of rows in output matrix
break;
}
case 4:
{
- input_x_dim = input->dimension(2); // Number of columns in input matrix
- input_k_dim = input->dimension(3); // Number of rows in input matrix
+ input_x_dim = input->dimension(2); // Number of columns in input matrix
+ input_k_dim = input->dimension(3); // Number of rows in input matrix
output_x_dim = output->dimension(2); // Number of columns in output matrix
output_k_dim = output->dimension(3); // Number of rows in output matrix
break;
@@ -192,7 +209,7 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
}
int ksize;
- switch(output_wf)
+ switch (output_wf)
{
case WeightFormat::OHWIo8:
{
@@ -216,11 +233,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
// output x_dim needs to be the same as the input
ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
-
}
return Status{};
}
} // namespace arm_compute
-#endif // defined(__aarch64__) \ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h
index 07908890f4..4528b25245 100644
--- a/src/core/NEON/kernels/NEReorderKernel.h
+++ b/src/core/NEON/kernels/NEReorderKernel.h
@@ -26,9 +26,10 @@
#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
#define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
-#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
@@ -36,7 +37,6 @@ namespace arm_compute
class NEReorderKernel : public INEKernel
{
public:
-
const char *name() const override
{
return "NEReorderKernel";
@@ -62,7 +62,10 @@ public:
* @param[in] input_wf WeightFormat of input.
* @param[in] output_wf WeightFormat of output.
*/
- void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
/** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel
*
@@ -73,25 +76,27 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
-
-/*****************************************************************************/
+ /*****************************************************************************/
private:
- const ITensor *_input{nullptr}; // Input tensor
- ITensor *_output{nullptr}; // Output tensor
- int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
- int32_t _kmax{0}; // Rows in input tensor
- int32_t _xmax{0}; // Columns in input tensor
- WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
- WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
+ const ITensor *_input{nullptr}; // Input tensor
+ ITensor *_output{nullptr}; // Output tensor
+ int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
+ int32_t _kmax{0}; // Rows in input tensor
+ int32_t _xmax{0}; // Columns in input tensor
+ WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
+ WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
};
} // namespace arm_compute
#endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */
-#endif // defined(__aarch64__) \ No newline at end of file
+#endif // defined(__aarch64__)
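Editor's note (not part of the patch): the validate() re-wrapped above enforces that the output k-dimension equals the input k-dimension rounded up to the blocking factor of the target weight format, with the x-dimension unchanged. A minimal standalone sketch of that relationship follows; the block sizes (4 for OHWIo4, 8 for OHWIo8) and the helper names are assumptions for illustration, not taken from this patch.

// Illustrative sketch only -- not part of this patch. Block sizes are assumed
// from the weight-format names; the enum and helpers are invented for the sketch.
#include <cassert>

enum class SketchWeightFormat { OHWIo4, OHWIo8 };

constexpr int block_of(SketchWeightFormat wf)
{
    return wf == SketchWeightFormat::OHWIo4 ? 4 : 8;
}

constexpr int expected_output_kdim(int input_kdim, SketchWeightFormat wf)
{
    const int block = block_of(wf);
    return ((input_kdim + block - 1) / block) * block; // round up to a multiple of the block
}

int main()
{
    // A 10-row weight matrix reordered to OHWIo4 keeps its columns and pads
    // the 10 rows up to 12, the next multiple of 4.
    assert(expected_output_kdim(10, SketchWeightFormat::OHWIo4) == 12);
    assert(expected_output_kdim(10, SketchWeightFormat::OHWIo8) == 16);
    return 0;
}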
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index a7b830c066..227570405c 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-NEReorgLayerKernel::NEReorgLayerKernel()
- : _input(nullptr), _output(nullptr), _stride(1)
+NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1)
{
}
@@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
Iterator out(_output, collapsed_window);
// Perform reorg
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- // Get spatial coords and channels
- const unsigned int w = id[idx_w];
- const unsigned int h = id[idx_h];
- const unsigned int c = id[idx_c];
-
- // Calculate mapping
- const unsigned int offset = c / out_c;
- Coordinates map_coords = id;
- map_coords.set(idx_w, w * stride + offset % stride);
- map_coords.set(idx_h, h * stride + offset / stride);
- map_coords.set(idx_c, c % out_c);
-
- // Perform mapping
- std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
- },
- out);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ // Get spatial coords and channels
+ const unsigned int w = id[idx_w];
+ const unsigned int h = id[idx_h];
+ const unsigned int c = id[idx_c];
+
+ // Calculate mapping
+ const unsigned int offset = c / out_c;
+ Coordinates map_coords = id;
+ map_coords.set(idx_w, w * stride + offset % stride);
+ map_coords.set(idx_h, h * stride + offset / stride);
+ map_coords.set(idx_c, c % out_c);
+
+ // Perform mapping
+ std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords),
+ _input->info()->element_size());
+ },
+ out);
}
} // namespace arm_compute
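Editor's note (not part of the patch): the lambda re-indented above computes the reorg coordinate mapping. A standalone restatement follows; it assumes the kernel's out_c equals the input channel count, which is what makes the mapping land on valid input coordinates.

// Illustrative sketch only -- not part of this patch. For an output element
// (w, h, c) of a reorg with stride s, the source input element is
// (w * s + offset % s, h * s + offset / s, c % in_c) with offset = c / in_c.
#include <cstdio>

struct Coord { unsigned int w, h, c; };

Coord reorg_source_coord(const Coord &out, unsigned int stride, unsigned int in_c)
{
    const unsigned int offset = out.c / in_c;
    return Coord{out.w * stride + offset % stride,
                 out.h * stride + offset / stride,
                 out.c % in_c};
}

int main()
{
    // Input 4x4x2 with stride 2 -> output 2x2x8. Output (1, 0, 5) reads input
    // (2, 1, 1): offset = 5 / 2 = 2, w = 1*2 + 0, h = 0*2 + 1, c = 5 % 2.
    const Coord src = reorg_source_coord(Coord{1, 0, 5}, 2, 2);
    std::printf("src = (%u, %u, %u)\n", src.w, src.h, src.c);
    return 0;
}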
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index ca6c117882..d2437eecd0 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -26,15 +26,17 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
@@ -42,11 +44,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,8 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NEReverseKernel::NEReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
+NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
{
}
@@ -80,7 +82,10 @@ void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITe
INEKernel::configure(calculate_max_window(*output->info()));
}
-Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
@@ -88,29 +93,30 @@ Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
}
template <typename T>
-void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
+void run_reverse(
+ const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
{
unsigned int axis_bit = 0;
const int rank = input->info()->num_dimensions();
- for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+ for (unsigned int i = 0; i < axis->info()->dimension(0); ++i)
{
int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
// The values of the axis tensor must be within [-rank, rank-1].

- if((axis_i < -rank) || (axis_i >= rank))
+ if ((axis_i < -rank) || (axis_i >= rank))
{
ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1].");
}
// In case of negative axis value i.e targeted axis(i) = rank + axis(i)
- if(axis_i < 0)
+ if (axis_i < 0)
{
axis_i = rank + axis_i;
}
// Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
- if(use_inverted_axis)
+ if (use_inverted_axis)
{
axis_i = (rank - 1) - axis_i;
}
@@ -127,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
-
- // Reverse 0 axis
- if(axis_bit & 0x1)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
+
+ // Reverse 0 axis
+ if (axis_bit & 0x1)
+ {
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ }
+
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+ auto out_ptr =
+ reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+ wrapper::vstore(out_ptr, in);
}
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
- auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
- wrapper::vstore(out_ptr, in);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
- }
- },
- input_it);
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) =
+ in;
+ }
+ },
+ input_it);
}
void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
@@ -172,7 +182,7 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 4:
run_reverse<uint32_t>(window, _input, _axis, _output, _use_inverted_axis);
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
index 7d9ec4691c..92261887f4 100644
--- a/src/core/NEON/kernels/NEReverseKernel.h
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
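Editor's note (not part of the patch): the re-formatted run_reverse() above starts by normalising the requested axes into a bitmask. A standalone sketch of that step follows; the final OR into the mask is assumed from how axis_bit is later tested bit by bit, and the function name is invented for the sketch.

// Illustrative sketch only -- not part of this patch. Each axis must lie in
// [-rank, rank-1], negative values wrap to rank + axis, and with the
// inverted-axis convention the index is flipped to (rank - 1) - axis before
// being folded into a bitmask of axes to reverse.
#include <cstdint>
#include <stdexcept>
#include <vector>

uint32_t build_reverse_axis_mask(const std::vector<int> &axes, int rank, bool use_inverted_axis)
{
    uint32_t axis_bit = 0;
    for (int axis_i : axes)
    {
        if (axis_i < -rank || axis_i >= rank)
        {
            throw std::invalid_argument("axis values must be within [-rank, rank-1]");
        }
        if (axis_i < 0)
        {
            axis_i = rank + axis_i; // negative axis counts from the back
        }
        if (use_inverted_axis)
        {
            axis_i = (rank - 1) - axis_i; // flip to the inverted-axis convention
        }
        axis_bit |= 1U << axis_i; // assumed accumulation into the mask
    }
    return axis_bit;
}

int main()
{
    // Rank-4 tensor, reversing axes {-1, 0} without inversion sets bits 3 and 0.
    return build_reverse_axis_mask({-1, 0}, 4, false) == 0x9u ? 0 : 1;
}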
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index b8c9b244ee..7789b828ea 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -29,13 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/select/list.h"
#include <arm_neon.h>
@@ -54,7 +53,8 @@ struct SelectKernelSelectorData
};
using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
-using KernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+using KernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
struct SelectKernelSelector
{
@@ -63,95 +63,62 @@ struct SelectKernelSelector
KernelPtr ukernel;
};
-static const SelectKernelSelector available_kernels[] =
-{
- {
- "neon_s8_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
- },
- {
- "neon_s16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
- },
- {
- "neon_s32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
- },
- {
- "neon_u8_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
- },
- {
- "neon_u16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
- },
- {
- "neon_u32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
- },
- {
- "neon_s8_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
- },
- {
- "neon_s16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
- },
- {
- "neon_s32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
- },
- {
- "neon_u8_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
- },
- {
- "neon_u16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
- },
- {
- "neon_u32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
- },
- {
- "neon_f16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
- },
- {
- "neon_f16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
- },
- {
- "neon_f32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
- },
- {
- "neon_f32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
- },
+static const SelectKernelSelector available_kernels[] = {
+ {"neon_s8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)},
+ {"neon_s16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)},
+ {"neon_s32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)},
+ {"neon_u8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)},
+ {"neon_u16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)},
+ {"neon_u32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)},
+ {"neon_s8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)},
+ {"neon_s16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)},
+ {"neon_s32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)},
+ {"neon_u8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)},
+ {"neon_u16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)},
+ {"neon_u32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)},
+ {"neon_f16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)},
+ {"neon_f16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)},
+ {"neon_f32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)},
+ {"neon_f32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)},
};
const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
INEKernel::configure(win);
}
-Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
@@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_output == nullptr);
ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
- const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+ const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
uk->ukernel(_c, _x, _y, _output, window);
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index e82105a68e..4fec42b536 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -82,7 +83,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
-
const ITensor *_c; /**< Condition tensor */
const ITensor *_x; /**< Source tensor 1 */
const ITensor *_y; /**< Source tensor 2 */
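Editor's note (not part of the patch): the re-flowed available_kernels table in NESelectKernel.cpp above follows a table-driven dispatch pattern: each entry pairs a name, a predicate and a micro-kernel pointer, and get_implementation() returns the first entry whose predicate matches. A self-contained sketch of that pattern follows; all names in it are invented for illustration.

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

struct SelectorData
{
    bool is_fp32;
    bool is_same_rank;
};

using Predicate = bool (*)(const SelectorData &);
using Ukernel   = void (*)();

struct KernelEntry
{
    const char *name;
    Predicate   is_selected;
    Ukernel     ukernel;
};

void run_f32_same_rank() { std::puts("f32 / same rank"); }
void run_f32_diff_rank() { std::puts("f32 / different rank"); }

static const KernelEntry available[] = {
    {"f32_same_rank", [](const SelectorData &d) { return d.is_fp32 && d.is_same_rank; }, run_f32_same_rank},
    {"f32_not_same_rank", [](const SelectorData &d) { return d.is_fp32 && !d.is_same_rank; }, run_f32_diff_rank},
};

const KernelEntry *get_impl(const SelectorData &data)
{
    // First matching predicate wins, mirroring get_implementation() above.
    for (const auto &uk : available)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}

int main()
{
    const KernelEntry *uk = get_impl(SelectorData{true, false});
    if (uk != nullptr)
    {
        uk->ukernel(); // prints "f32 / different rank"
    }
    return 0;
}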
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 673eace3c1..da023aeb96 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -41,19 +42,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
} // namespace
NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _paddings(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _padding_left(),
+ _block_shape_x(),
+ _block_shape_y()
{
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
_input = input;
_block_shape = block_shape;
@@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_
INEKernel::configure(win);
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
_block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
}
- if(_paddings != nullptr)
+ if (_paddings != nullptr)
{
- const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
- const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+ const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({0, 0}));
+ const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({1, 0}));
_padding_left = Size2D(pad_left_x, pad_left_y);
}
const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
@@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.x();
- const size_t out_y = id.y();
- const size_t z = id.z();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ in_x, in_y, z, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.x();
+ const size_t out_y = id.y();
+ const size_t z = id.z();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{in_x, in_y, z, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.y();
- const size_t out_y = id.z();
- const size_t z = id.x();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ z, in_x, in_y, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.y();
+ const size_t out_y = id.z();
+ const size_t z = id.x();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{z, in_x, in_y, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
index 44b8cbb514..6292c07136 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -69,7 +70,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -79,7 +85,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -91,7 +100,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
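Editor's note (not part of the patch): the NCHW lambda re-indented in NESpaceToBatchLayerKernel::run() above maps each output element back to a source position inside the padded input and copies only when that position is inside the un-padded region. A standalone restatement of that mapping follows; names and the example numbers are illustrative.

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

struct Mapping
{
    bool in_bounds; // false means the output element is padding
    int  in_x, in_y, in_batch;
};

Mapping space_to_batch_source(int out_x, int out_y, int batch_id,
                              int batch_size,          // input batch count
                              int block_x, int block_y,
                              int pad_left_x, int pad_left_y,
                              int width, int height)   // un-padded input W/H
{
    const int pos_x = out_x * block_x + (batch_id / batch_size) % block_x;
    const int pos_y = out_y * block_y + (batch_id / batch_size) / block_x;
    if (pos_y >= pad_left_y && pos_y < pad_left_y + height &&
        pos_x >= pad_left_x && pos_x < pad_left_x + width)
    {
        return Mapping{true, pos_x - pad_left_x, pos_y - pad_left_y, batch_id % batch_size};
    }
    return Mapping{false, 0, 0, 0};
}

int main()
{
    // 4x4 single-batch input, 2x2 block, no padding -> 2x2 output spread over 4 batches.
    const Mapping m = space_to_batch_source(/*out_x=*/1, /*out_y=*/0, /*batch_id=*/3,
                                            /*batch_size=*/1, /*block_x=*/2, /*block_y=*/2,
                                            /*pad_left_x=*/0, /*pad_left_y=*/0,
                                            /*width=*/4, /*height=*/4);
    std::printf("in_bounds=%d in=(%d, %d) batch=%d\n", m.in_bounds, m.in_x, m.in_y, m.in_batch);
    return 0;
}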
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 7687c50c40..b49c5ee344 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.z();
- const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ in_x, in_y, z, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.z();
+ const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{in_x, in_y, z, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.x();
- const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ z, in_x, in_y, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.x();
+ const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{z, in_x, in_y, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index 953b68a401..7d147c5b94 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 93080e2ac7..e23b40a9aa 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -25,13 +25,13 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +41,11 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -50,9 +54,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -60,7 +65,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -71,11 +77,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
return std::make_pair(Status{}, win);
}
-inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
{
constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D
Coordinates id_out = id;
- for(unsigned int i = max_out_coord - 1; i > axis; --i)
+ for (unsigned int i = max_out_coord - 1; i > axis; --i)
{
id_out.set(i, id[i - 1]);
}
@@ -84,12 +91,12 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id,
}
} // namespace
-NEStackLayerKernel::NEStackLayerKernel()
- : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input()
{
}
-void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(
+ const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -106,10 +113,15 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi
INEKernel::configure(win_config.second);
}
-Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
@@ -131,12 +143,15 @@ void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
- const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
- std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
- },
- input);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+ const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w +
+ id_out[4] * stride_k;
+ std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
+ },
+ input);
}
} // namespace arm_compute
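For reference, the reformatted NEStackLayerKernel::run loop above computes a linear byte offset from a shifted coordinate: dimensions at or above the stack axis move up by one place and the freed slot indexes the tensor being stacked (the tail of shift_from_axis_and_replace_coordinate is elided by the hunk above and is assumed here to set that slot to idx_input). A minimal standalone sketch of that indexing, with illustrative names only:

    #include <array>
    #include <cstddef>

    // Sketch of the stack-layer indexing, assuming the elided tail of
    // shift_from_axis_and_replace_coordinate sets the axis slot to idx_input.
    // Input coordinates are at most 4-D, output coordinates at most 5-D.
    std::size_t stacked_output_offset(const std::array<std::size_t, 4> &in_coord,
                                      unsigned int                      axis,
                                      unsigned int                      idx_input,
                                      const std::array<std::size_t, 5> &out_strides_in_bytes)
    {
        std::array<std::size_t, 5> out_coord{};
        for (unsigned int i = 0; i < 5; ++i)
        {
            if (i < axis)
                out_coord[i] = in_coord[i]; // lower dimensions are untouched
            else if (i == axis)
                out_coord[i] = idx_input; // slot selecting the stacked tensor
            else
                out_coord[i] = in_coord[i - 1]; // upper dimensions shift up by one
        }
        std::size_t offset = 0;
        for (unsigned int i = 0; i < 5; ++i)
        {
            offset += out_coord[i] * out_strides_in_bytes[i]; // id_out[i] * stride_i above
        }
        return offset;
    }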
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 9b36518e4d..685812b56d 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+ void configure(
+ const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -78,7 +80,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 2b406a8b8b..efff51be9d 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
// Create window
@@ -88,38 +95,49 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-NEStridedSliceKernel::NEStridedSliceKernel()
- : _starts_abs(), _final_strides(), _shrink_mask()
+NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask()
{
}
-void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSliceKernel::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
_shrink_mask = shrink_axis_mask;
const TensorShape &input_shape = input->tensor_shape();
Coordinates ends_abs;
- std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(_starts_abs, ends_abs, _final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ auto win_config =
+ validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends,
+ strides, begin_mask, end_mask, shrink_axis_mask)
+ .first);
return Status{};
}
@@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
size_t length_x = win.shape()[0];
- if(_final_strides[0] == 1 && !is_shrink_x)
+ if (_final_strides[0] == 1 && !is_shrink_x)
{
win.set(Window::DimX, Window::Dimension(0, 1, 1));
width_size = width_size * length_x;
@@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
uint8_t *cur_ptr;
execute_window_loop(
- win, [&](const Coordinates & id)
- {
- cur_ptr = input_base;
- cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
- cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
- cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
- cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
-
- std::copy_n(cur_ptr, width_size, output_it.ptr());
- },
- output_it);
+ win,
+ [&](const Coordinates &id)
+ {
+ cur_ptr = input_base;
+ cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
+ cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
+ cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
+ cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
+
+ std::copy_n(cur_ptr, width_size, output_it.ptr());
+ },
+ output_it);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h
index 9ce517417d..a475f09a17 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.h
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
#include <cstdint>
@@ -68,9 +69,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
*
@@ -86,9 +92,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
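The doxygen above only spells out the shrink_axis_mask semantics; the start/end/stride triple itself follows the usual strided-slice convention, where a positive stride over the half-open range [start, end) yields ceil((end - start) / stride) elements in that dimension. The full shape handling, including the begin/end masks, lives in compute_strided_slice_shape and the tensor_transform helpers and is not reproduced here; an illustrative 1-D helper under those assumptions:

    #include <cstdint>

    // Illustrative only: number of elements one dimension of a strided slice
    // produces for a positive stride over [start, end), assuming 0 <= start < end.
    // validate_arguments above already rejects a stride of zero.
    int64_t slice_len_1d(int64_t start, int64_t end, int64_t stride)
    {
        return (end - start + stride - 1) / stride; // ceil division for stride > 0
    }

    // Per the documentation above, a set bit i in shrink_axis_mask then removes
    // dimension i entirely, keeping the single element at starts[i].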
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 94256dc12d..577ce5b69e 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NETileKernel::NETileKernel()
- : _input(nullptr), _output(nullptr)
+NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window output_window{ window };
- output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
Window out_slice = output_window.first_slice_window_1D();
const auto src_shape = _input->info()->tensor_shape();
@@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
{
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates & id)
- {
- const size_t x = id.x();
- const size_t y = id.y();
- const size_t z = id.z();
- const size_t w = id[3];
- Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
- memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
- },
- output_it);
- }
- while(output_window.slide_window_slice_1D(out_slice));
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &id)
+ {
+ const size_t x = id.x();
+ const size_t y = id.y();
+ const size_t z = id.z();
+ const size_t w = id[3];
+ Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]};
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_coords),
+ _input->info()->dimension(0) * _input->info()->element_size());
+ },
+ output_it);
+ } while (output_window.slide_window_slice_1D(out_slice));
}
} // namespace arm_compute
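The reformatted run() loop above implements tiling as wrap-around indexing: each output coordinate maps back to the source through a per-dimension modulo (x % src_shape[0] and so on). The same idea in a standalone 1-D sketch, with illustrative names:

    #include <cstddef>
    #include <vector>

    // 1-D equivalent of the modulo indexing in NETileKernel::run: the source is
    // repeated `multiple` times along the dimension.
    std::vector<float> tile_1d(const std::vector<float> &src, std::size_t multiple)
    {
        std::vector<float> dst(src.size() * multiple);
        for (std::size_t x = 0; x < dst.size(); ++x)
        {
            dst[x] = src[x % src.size()]; // wrap around, as in x % src_shape[0]
        }
        return dst;
    }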
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index dbd47ccfa9..13c2d314e4 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -38,9 +38,8 @@ struct DepthwiseConfig
DepthwiseMethod method = DepthwiseMethod::DEFAULT;
std::string filter = "";
- DepthwiseConfig(DepthwiseMethod method)
- : method(method) {};
- DepthwiseConfig() {};
+ DepthwiseConfig(DepthwiseMethod method) : method(method){};
+ DepthwiseConfig(){};
};
struct DepthwiseArgs
@@ -63,18 +62,24 @@ struct DepthwiseArgs
bool fast_mode = false;
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int dilation_rows, unsigned int dilation_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
-
- const DepthwiseConfig *config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int dilation_rows,
+ unsigned int dilation_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
: cpu_info(cpu_info),
kernel_rows(kernel_rows),
kernel_cols(kernel_cols),
@@ -95,20 +100,38 @@ struct DepthwiseArgs
{
}
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
- const DepthwiseConfig *config)
- : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
- stride_cols, 1, 1, n_batches, input_rows, input_cols,
- input_channels, output_rows, output_cols,
- channel_multiplier, padding, activation, config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+ const DepthwiseConfig *config)
+ : DepthwiseArgs(cpu_info,
+ kernel_rows,
+ kernel_cols,
+ stride_rows,
+ stride_cols,
+ 1,
+ 1,
+ n_batches,
+ input_rows,
+ input_cols,
+ input_channels,
+ output_rows,
+ output_cols,
+ channel_multiplier,
+ padding,
+ activation,
+ config)
{
}
};
@@ -127,17 +150,18 @@ struct Tile
{
}
- Tile()
- : Tile(nullptr, 0, 0, 0)
+ Tile() : Tile(nullptr, 0, 0, 0)
{
}
- void load_from(
- const TInput *input,
- const unsigned int ld_row, const unsigned int ld_col,
- const unsigned int n_rows, const unsigned int n_cols,
- const int input_i, const int input_j,
- const unsigned int channel_multiplier) const
+ void load_from(const TInput *input,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ const unsigned int n_rows,
+ const unsigned int n_cols,
+ const int input_i,
+ const int input_j,
+ const unsigned int channel_multiplier) const
{
const auto pad_top = input_i < 0 ? -input_i : 0;
const auto pad_left = input_j < 0 ? -input_j : 0;
@@ -145,18 +169,15 @@ struct Tile
const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
- if(padded_rows < tile_rows || padded_cols < tile_cols)
+ if (padded_rows < tile_rows || padded_cols < tile_cols)
{
memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
}
- do_premultiply<TInput>(
- (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col,
- ld_row, ld_col,
- array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
- tile_cols * tile_channels, tile_channels,
- padded_rows, padded_cols, tile_channels / channel_multiplier,
- channel_multiplier);
+ do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+ ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+ tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+ tile_channels / channel_multiplier, channel_multiplier);
}
};
@@ -168,9 +189,8 @@ protected:
std::string m_name{};
public:
- DepthwiseCommon(const DepthwiseArgs &args)
- : m_args(args) {};
- DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
std::string name() const override
@@ -181,19 +201,18 @@ public:
void set_name(std::string name)
{
// Only allow the name to be set once
- if(m_name.empty())
+ if (m_name.empty())
{
m_name = name;
}
}
- void execute(
- const void *const input,
- const void *const parameters,
- void *const output,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ const void *const parameters,
+ void *const output,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
const size_t ld_input_col = m_args.input_channels;
const size_t ld_input_row = ld_input_col * m_args.input_cols;
@@ -202,56 +221,47 @@ public:
const size_t ld_output_row = ld_output_col * m_args.output_cols;
const size_t ld_output_batch = ld_output_row * m_args.output_rows;
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters, output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *const parameters,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *const parameters,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.input_channels, m_args.padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &padding,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const override final
+ void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
{
// Construct a new set of arguments to reflect that we might have been
// passed different input/output tensors. Dilation is handled at this
@@ -271,38 +281,33 @@ public:
auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
- for(size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
{
size_t start_i;
- std::tie(args.output_rows, args.input_rows, start_i,
- args.padding.top, args.padding.bottom) =
- get_reduced_view_for_dilation(
- output_height, input_height, drow, m_args.dilation_rows,
- m_args.kernel_rows, m_args.stride_rows, padding.top);
+ std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
- if(args.output_rows)
+ if (args.output_rows)
{
- for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
{
size_t start_j;
- std::tie(args.output_cols, args.input_cols, start_j,
- args.padding.left, args.padding.right) =
- get_reduced_view_for_dilation(
- output_width, input_width, dcol, m_args.dilation_cols,
- m_args.kernel_cols, m_args.stride_cols, padding.left);
+ std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
const TInput *input_col = input_row + start_j * ld_input_col;
TOutput *output_col = output_row + dcol * ld_output_col;
- if(args.output_cols)
+ if (args.output_cols)
{
- this->execute_internal(
- args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
- output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
- working_space, thread_id, n_threads);
+ this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+ parameters, output_col, ld_output_col_d, ld_output_row_d,
+ ld_output_batch, working_space, thread_id, n_threads);
}
}
}
@@ -310,20 +315,19 @@ public:
}
protected:
- virtual void execute_internal(
- const DepthwiseArgs &instance_args,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute_internal(const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
virtual bool uses_premultiply() const
{
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index a5db793b3d..5ff848e281 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -49,11 +49,7 @@ struct KernelDescription
bool is_default = false;
uint64_t cycle_estimate = 0;
- KernelDescription(
- DepthwiseMethod method,
- std::string name,
- bool is_default,
- uint64_t cycle_estimate)
+ KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
: method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
{
}
@@ -78,58 +74,51 @@ public:
// pointer to the bias vector (which may be nullptr in the case of no bias) and
// a pointer to the array of weights (stored in HWIO order).
virtual void pack_parameters(
- void *buffer,
- const void *biases,
- const void *weights,
- size_t ld_weight_col = 0,
- size_t ld_weight_row = 0) = 0;
+ void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
// Determine the amount of working space required
virtual size_t get_working_size(unsigned int n_threads) const = 0;
// Execute the convolution over the specified area of memory.
- virtual void execute(
- const void *input, // Pointer to input tensor
- const void *parameters, // Packed parameters buffer
- void *output,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute(const void *input, // Pointer to input tensor
+ const void *parameters, // Packed parameters buffer
+ void *output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
// To handle a dilation factor of D execute the kernel once for each d in
@@ -145,12 +134,13 @@ public:
// - Number of valid input pixels corresponding to `d`
// - Offset of the first pixel corresponding to `d`
// - Amount of padding in the view for `d`
-std::tuple<size_t, size_t, size_t, size_t, size_t>
-get_reduced_view_for_dilation(
- size_t out_size, size_t in_size,
- size_t d, size_t dilation_factor,
- size_t kernel_size, size_t stride,
- size_t pad_before);
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+ size_t in_size,
+ size_t d,
+ size_t dilation_factor,
+ size_t kernel_size,
+ size_t stride,
+ size_t pad_before);
} // namespace depthwise
} // namespace arm_conv
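The comment and the get_reduced_view_for_dilation declaration above describe how dilation is decomposed: each output only combines inputs spaced dilation_factor apart, so the outputs split into dilation_factor phases, and each phase is an ordinary non-dilated pass over a strided view of the tensors (DepthwiseCommon::execute builds those views by scaling the leading dimensions, e.g. ld_input_col * dilation_cols). A 1-D illustration of the phase split, not library code:

    #include <cstddef>
    #include <vector>

    // Visit the strided view used by each dilation phase: phase d covers
    // elements d, d + D, d + 2D, ... The real kernels additionally offset each
    // phase by the start index returned by get_reduced_view_for_dilation.
    void for_each_dilation_phase(const std::vector<float> &data, std::size_t D)
    {
        for (std::size_t d = 0; d < D; ++d)
        {
            for (std::size_t i = d; i < data.size(); i += D)
            {
                // run the non-dilated kernel logic on data[i] ...
                (void)data[i];
            }
        }
    }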
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index f1f70cf1d6..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -68,45 +68,42 @@ public:
virtual size_t get_working_size(unsigned int num_threads) const = 0;
// Execute pooling over the specified area of memory.
- virtual void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
} // namespace pooling
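The execute() overloads above share the leading-dimension convention used throughout this patch: ld_*_col, ld_*_row and ld_*_batch are the strides between consecutive columns, rows and batches, and for a densely packed NHWC buffer they follow directly from the shape, exactly as DepthwiseCommon::execute derives them earlier (ld_input_col = channels, ld_input_row = ld_input_col * cols, and so on). A minimal sketch of that indexing, with illustrative names:

    #include <cstddef>

    // Packed-NHWC indexing implied by the leading-dimension parameters above
    // (illustrative names; strides counted in elements).
    std::size_t nhwc_offset(std::size_t batch, std::size_t row, std::size_t col, std::size_t channel,
                            std::size_t n_rows, std::size_t n_cols, std::size_t n_channels)
    {
        const std::size_t ld_col   = n_channels;      // ld_input_col
        const std::size_t ld_row   = ld_col * n_cols; // ld_input_row
        const std::size_t ld_batch = ld_row * n_rows; // ld_input_batch
        return batch * ld_batch + row * ld_row + col * ld_col + channel;
    }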
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index e8db35c593..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -36,9 +36,8 @@ struct PoolingConfig
PoolingMethod method = PoolingMethod::DEFAULT;
std::string filter = "";
- PoolingConfig(PoolingMethod method)
- : method(method) {};
- PoolingConfig() {};
+ PoolingConfig(PoolingMethod method) : method(method){};
+ PoolingConfig(){};
};
struct PoolingArgs
@@ -57,30 +56,40 @@ struct PoolingArgs
const PoolingConfig *config;
- PoolingArgs(
- const CPUInfo *cpu_info,
- PoolingType pool_type,
- const PoolingWindow &window,
- const PoolingStride &stride,
- bool exclude_padding,
- unsigned int n_batches,
- unsigned int input_rows,
- unsigned int input_cols,
- unsigned int n_channels,
- unsigned int output_rows,
- unsigned int output_cols,
- const PaddingValues &padding,
- const PoolingConfig *cfg)
- : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+ PoolingArgs(const CPUInfo *cpu_info,
+ PoolingType pool_type,
+ const PoolingWindow &window,
+ const PoolingStride &stride,
+ bool exclude_padding,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int n_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ const PaddingValues &padding,
+ const PoolingConfig *cfg)
+ : cpu_info(cpu_info),
+ pool_type(pool_type),
+ pool_window(window),
+ pool_stride(stride),
+ exclude_padding(exclude_padding),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ n_channels(n_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ padding(padding),
+ config(cfg)
{
// If either of the pooling window dimensions are set to zero, meaning
// "pool everything", then replace with the corresponding input dimension.
- if(pool_window.rows == 0)
+ if (pool_window.rows == 0)
{
pool_window.rows = input_rows;
}
- if(pool_window.cols == 0)
+ if (pool_window.cols == 0)
{
pool_window.cols = input_cols;
}
@@ -100,10 +109,16 @@ struct Requantize32
int32_t per_layer_right_shift = 0;
int32_t per_layer_mul = 0;
- Requantize32(int32_t input_offset, int32_t output_offset,
- int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+ Requantize32(int32_t input_offset,
+ int32_t output_offset,
+ int32_t per_layer_left_shift,
+ int32_t per_layer_right_shift,
int32_t per_layer_mul)
- : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+ : input_offset(input_offset),
+ output_offset(output_offset),
+ per_layer_left_shift(per_layer_left_shift),
+ per_layer_right_shift(per_layer_right_shift),
+ per_layer_mul(per_layer_mul)
{
}
};
@@ -115,105 +130,88 @@ protected:
const PoolingArgs m_args;
public:
- PoolingCommon(const PoolingArgs &args)
- : m_args(args)
+ PoolingCommon(const PoolingArgs &args) : m_args(args)
{
}
- PoolingCommon(PoolingCommon &) = delete;
+ PoolingCommon(PoolingCommon &) = delete;
PoolingCommon &operator=(PoolingCommon &) = delete;
size_t get_working_size(unsigned int) const override = 0;
// Execute pooling over the specified area of memory.
- void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- input,
- m_args.n_channels,
- m_args.n_channels * m_args.input_cols,
- m_args.n_channels * m_args.input_cols * m_args.input_rows,
- output,
- m_args.n_channels,
- m_args.n_channels * m_args.output_cols,
- m_args.n_channels * m_args.output_cols * m_args.output_rows,
- working_space,
- thread_id, num_threads);
+ this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+ m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+ m_args.n_channels * m_args.output_cols,
+ m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+ num_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding, m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+ ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &padding,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute_internal(
- batches, height, width, channels, padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- output_height, output_width,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+ ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, num_threads);
}
protected:
- virtual void execute_internal(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const PaddingValues &,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute_internal(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
template <typename TInput, typename TOutput>
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
index 16f26de38a..fb97cf8baf 100644
--- a/src/core/NEON/kernels/assembly/premultiply.hpp
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr,
const unsigned input_channels,
const unsigned int channel_multiplier)
{
- if(sizeof(T) == 4 && channel_multiplier == 6)
+ if (sizeof(T) == 4 && channel_multiplier == 6)
{
- do_premultiply_float_6(
- (const float *)in_ptr, ld_row, ld_col,
- (float *)out_ptr, out_ld_row, out_ld_col,
- tile_rows, tile_cols,
- input_channels);
+ do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+ tile_rows, tile_cols, input_channels);
}
else
{
- for(unsigned int i = 0; i < tile_rows; i++)
+ for (unsigned int i = 0; i < tile_rows; i++)
{
const T *ip2 = in_ptr + i * ld_row;
T *op2 = out_ptr + i * out_ld_row;
- for(unsigned int j = 0; j < tile_cols; j++)
+ for (unsigned int j = 0; j < tile_cols; j++)
{
const T *ip = ip2;
T *op = op2;
- for(unsigned int c = 0; c < input_channels; c++)
+ for (unsigned int c = 0; c < input_channels; c++)
{
T val = *ip;
ip++;
- for(unsigned int r = 0; r < channel_multiplier; r++)
+ for (unsigned int r = 0; r < channel_multiplier; r++)
{
op[r] = val;
}
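The generic path of do_premultiply above replicates every input channel value channel_multiplier times along the output channel axis (the depth-multiplier expansion consumed by the depthwise kernels). A standalone per-pixel sketch of that replication:

    #include <vector>

    // Per-pixel channel premultiplication: input_channels values become
    // input_channels * channel_multiplier values, each repeated in place.
    std::vector<float> premultiply_channels(const std::vector<float> &in_channels,
                                            unsigned int              channel_multiplier)
    {
        std::vector<float> out;
        out.reserve(in_channels.size() * channel_multiplier);
        for (float val : in_channels)
        {
            for (unsigned int r = 0; r < channel_multiplier; ++r)
            {
                out.push_back(val); // op[r] = val in the loop above
            }
        }
        return out;
    }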
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 50290757ec..dbf95d23cd 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -45,17 +45,24 @@ struct ConvolutionArgs
Shape2D kernel_shape;
arm_gemm::Activation activation;
- ConvolutionArgs(
- unsigned int n_batches,
- const Shape2D &input_shape,
- unsigned int n_input_channels,
- unsigned int pad_top, unsigned int pad_left,
- const Shape2D &output_shape,
- unsigned int n_output_channels,
- const Shape2D kernel_shape,
- const arm_gemm::Activation &activation = {})
- : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
- kernel_shape(kernel_shape), activation(activation)
+ ConvolutionArgs(unsigned int n_batches,
+ const Shape2D &input_shape,
+ unsigned int n_input_channels,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ const Shape2D &output_shape,
+ unsigned int n_output_channels,
+ const Shape2D kernel_shape,
+ const arm_gemm::Activation &activation = {})
+ : n_batches(n_batches),
+ input_shape(input_shape),
+ n_input_channels(n_input_channels),
+ pad_top(pad_top),
+ pad_left(pad_left),
+ output_shape(output_shape),
+ n_output_channels(n_output_channels),
+ kernel_shape(kernel_shape),
+ activation(activation)
{
}
};
@@ -105,23 +112,30 @@ public:
virtual unsigned int get_transformed_tile_rows(void) const = 0;
virtual unsigned int get_transformed_tile_cols(void) const = 0;
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, const WinogradDomainSpec &wds,
- unsigned int thread_id, unsigned int n_threads) const
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_row, ld_in_col, ld_input_channel,
- outptr, wds.weight_ld_matrix, wds.weight_ld_row,
- thread_id, n_threads);
+ this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+ wds.weight_ld_row, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, size_t ld_out_matrix, size_t ld_out_row,
- unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace weight_transform
@@ -136,27 +150,35 @@ public:
virtual unsigned int get_input_rows(void) const = 0;
virtual unsigned int get_input_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, const WinogradDomainSpec &wds,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_batch, ld_in_row, ld_in_col,
- outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+ wds.input_ld_row, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace input_transform
@@ -177,31 +199,37 @@ public:
virtual unsigned int get_kernel_rows(void) const = 0;
virtual unsigned int get_kernel_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, const WinogradDomainSpec &wds,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ const WinogradDomainSpec &wds,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args,
- inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
- bias,
- outptr, ld_out_batch, ld_out_row, ld_out_col,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+ ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_matrix,
+ size_t ld_in_row,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace output_transform
@@ -210,7 +238,7 @@ struct WinogradImpl
{
const output_transform::ITransform *output_transform = nullptr;
const weight_transform::ITransform *weight_transform = nullptr;
- const input_transform::ITransform *input_transform = nullptr;
+ const input_transform::ITransform *input_transform = nullptr;
std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
WinogradDomainSpec winograd_spec;
};
@@ -220,15 +248,18 @@ struct WinogradImpl
* Assigns to the pointers in the `dest` struct and returns true or false to
* indicate whether the given problem can be executed or not.
*/
-template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
-bool get_implementation(
- WinogradImpl &dest, // Destination for the selected implementation
- const CPUInfo *,
- const ConvolutionArgs &,
- int max_threads,
- bool fast_mode,
- const WinogradConfig *,
- const arm_gemm::GemmConfig *);
+template <typename TIn,
+ typename TWeight = TIn,
+ typename TOut = TIn,
+ typename TWinogradIn = TIn,
+ typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *);
} // namespace winograd
} // namespace arm_conv
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index ed5254a0a4..e3d9b670b3 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -37,12 +38,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float16_t>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Construct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+ // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Construct vectors
- const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float16_t denominator = sqrt(input_var[x] + epsilon);
- const float16_t numerator = input_ptr[x] - input_mean[x];
- const float16_t x_bar = numerator / denominator;
- float16_t res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ // Construct vectors
+ const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float16_t denominator = sqrt(input_var[x] + epsilon);
+ const float16_t numerator = input_ptr[x] - input_mean[x];
+ const float16_t x_bar = numerator / denominator;
+ float16_t res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float16_t *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float16_t *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>>}};
+} // namespace
namespace cpu
{
-void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
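
For reference, the arithmetic that both the vector loop and the leftover loop above implement is the standard batch-normalization formula with an optional fused activation: x_bar = (x - mean) / sqrt(var + epsilon), then res = beta + gamma * x_bar. A minimal scalar sketch of that per-element work follows; the function and parameter names are illustrative and not part of the library.

#include <algorithm>
#include <cmath>

// Per-element batch normalization with an optional fused ReLU, mirroring the
// per-lane work of the vectorised loop above. A missing gamma defaults to 1
// and a missing beta to 0, as in the kernel.
inline float batch_norm_relu_ref(
    float x, float mean, float var, float gamma, float beta, float epsilon, bool fuse_relu)
{
    const float x_bar = (x - mean) / std::sqrt(var + epsilon);
    float       res   = beta + x_bar * gamma;
    if (fuse_relu)
    {
        res = std::max(res, 0.0f);
    }
    return res;
}

The NEON kernel vectorises exactly this expression with wrapper::vinvsqrt and wrapper::vmla, and falls back to the scalar form for the leftover elements.
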
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index d6e22e1843..4e1654ee6b 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -36,12 +37,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Construct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+ // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Construct vectors
- const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float denominator = sqrt(input_var[x] + epsilon);
- const float numerator = input_ptr[x] - input_mean[x];
- const float x_bar = numerator / denominator;
- float res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ // Construct vectors
+ const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float denominator = sqrt(input_var[x] + epsilon);
+ const float numerator = input_ptr[x] - input_mean[x];
+ const float x_bar = numerator / denominator;
+ float res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>>}};
+} // namespace
namespace cpu
{
-void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
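
The fused_map above (and its fp16 twin earlier) is a plain function-pointer table keyed on the activation function: batch_normalization is instantiated once per supported activation and the right instantiation is picked at run time. A condensed sketch of the same dispatch pattern, with hypothetical names and only two entries:

#include <cstddef>
#include <map>

enum class Activation
{
    RELU,
    BOUNDED_RELU
};

using KernelFn = void (*)(const float *in, float *out, std::size_t n);

// Stand-ins for the batch_normalization<detail::relu<...>> instantiations.
void bn_relu(const float *in, float *out, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = (in[i] > 0.f) ? in[i] : 0.f;
    }
}

void bn_bounded_relu(const float *in, float *out, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        const float v = (in[i] > 0.f) ? in[i] : 0.f;
        out[i]        = (v < 6.f) ? v : 6.f; // bound chosen arbitrarily for the sketch
    }
}

static const std::map<Activation, KernelFn> dispatch = {
    {Activation::RELU, &bn_relu},
    {Activation::BOUNDED_RELU, &bn_bounded_relu},
};

void run(Activation act, const float *in, float *out, std::size_t n)
{
    // Same shape as fused_map[act_info.activation()](src, dst, ...) above.
    dispatch.at(act)(in, out, n);
}
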
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 98cd9aa7fe..48caaa3e63 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
@@ -37,8 +38,15 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f16(epsilon);
const auto const_1 = svdup_n_f16(1.f);
const auto const_0 = svdup_n_f16(0.f);
const auto va = svdup_n_f16(act_info.a());
const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Construct vectors
- const auto mean_vec = svld1_f16(pg, input_mean + x);
- const auto var_vec = svld1_f16(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f16(tmp);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Construct vectors
+ const auto mean_vec = svld1_f16(pg, input_mean + x);
+ const auto var_vec = svld1_f16(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f16_z(pg, numerator, denominator);
- auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f16(tmp);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f16_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f16_z(pg, numerator, denominator);
+ auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f16_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f16(pg, output_ptr + x, res);
+ // Store results
+ svst1_f16(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
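
Note how the SVE kernel above needs no separate leftover loop: each iteration rebuilds a predicate with svwhilelt and the loop keeps running while svptest_any reports an active lane, so the tail is masked rather than peeled. A scalar emulation of that control flow (vl and the per-lane work are placeholders for the real vector length and batch-norm arithmetic):

#include <cstddef>
#include <vector>

void predicated_loop(const float *in, float *out, std::size_t n, std::size_t vl)
{
    std::vector<bool> pg(vl); // one flag per lane, like an SVE predicate

    // Equivalent of pg = svwhilelt(start, n); returns whether any lane is active.
    auto whilelt = [&](std::size_t start)
    {
        bool any_active = false;
        for (std::size_t lane = 0; lane < vl; ++lane)
        {
            pg[lane]   = (start + lane) < n;
            any_active = any_active || pg[lane];
        }
        return any_active;
    };

    std::size_t x      = 0;
    bool        active = whilelt(x);
    while (active)
    {
        for (std::size_t lane = 0; lane < vl; ++lane)
        {
            if (pg[lane]) // inactive lanes load and store nothing, like a masked lane
            {
                out[x + lane] = in[x + lane]; // stand-in for the normalization arithmetic
            }
        }
        x += vl;             // x += svcnt...()
        active = whilelt(x); // pg = svwhilelt(x, n)
    }
}
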
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index 952ab320bf..df4fbfe607 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
@@ -37,8 +38,15 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f32(epsilon);
const auto const_1 = svdup_n_f32(1.f);
const auto const_0 = svdup_n_f32(0.f);
const auto va = svdup_n_f32(act_info.a());
const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Construct vectors
- const auto mean_vec = svld1_f32(pg, input_mean + x);
- const auto var_vec = svld1_f32(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f32(tmp);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ // Construct vectors
+ const auto mean_vec = svld1_f32(pg, input_mean + x);
+ const auto var_vec = svld1_f32(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f32_z(pg, numerator, denominator);
- auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f32(tmp);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f32_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f32_z(pg, numerator, denominator);
+ auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f32_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f32(pg, output_ptr + x, res);
+ // Store results
+ svst1_f32(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
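
Both SVE kernels compute the denominator as a reciprocal square-root estimate (svrsqrte) refined twice with svrsqrts. svrsqrts(a, b) returns (3 - a*b) / 2, so each refinement is the Newton-Raphson step y' = y * (3 - x*y*y) / 2 for y ~ 1/sqrt(x). A scalar sketch of that refinement; the initial guess here is deliberately crude, whereas the hardware estimate starts much closer:

#include <cmath>
#include <cstdio>

// One Newton-Raphson step towards 1/sqrt(x): y' = y * (3 - x*y*y) / 2.
// The kernels compute y = y * svrsqrts(x*y, y), which is the same update.
inline float rsqrt_step(float x, float y)
{
    return y * (3.0f - x * y * y) * 0.5f;
}

int main()
{
    const float x = 2.5f; // plays the role of var + epsilon
    float       y = 0.5f; // crude initial estimate (svrsqrte in the kernels)
    y = rsqrt_step(x, y); // first refinement
    y = rsqrt_step(x, y); // second refinement, matching the two steps above
    std::printf("refined: %f  exact: %f\n", y, 1.0f / std::sqrt(x));
    return 0;
}
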
diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h
index 8e0ea36f5a..cbf540bd71 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/list.h
+++ b/src/core/NEON/kernels/batchnormalization/impl/list.h
@@ -28,9 +28,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
- void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
+ void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \
+ const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization);
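
DECLARE_BATCH_NORMALIZATION_KERNEL only stamps out the shared prototype for each per-architecture kernel; the reformat above changes its line breaks, not what it expands to. A trimmed illustration of the same declaration-macro pattern, using a shortened hypothetical signature:

// Each use expands to one function declaration with the common signature.
#define DECLARE_KERNEL(func_name) void func_name(const float *src, float *dst, float epsilon)

DECLARE_KERNEL(fp16_kernel_ref); // -> void fp16_kernel_ref(const float *src, float *dst, float epsilon);
DECLARE_KERNEL(fp32_kernel_ref);

#undef DECLARE_KERNEL
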
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 3900ea62cd..95cdc8f2f9 100644
--- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -158,8 +159,7 @@ struct logistic
*
* @param[in] act_info Activation layer information.
*/
- explicit logistic(ActivationLayerInfo act_info)
- : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+ explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
@@ -198,8 +198,7 @@ struct relu
*
* @param[in] act_info Activation layer information.
*/
- explicit relu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+ explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
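
The functors touched above (detail::logistic, detail::relu and the bounded variants used by fused_map) share one shape: the constructor captures the constants the activation needs, broadcast into a vector register, and operator() applies the activation in place, which is why the batch-normalization kernels can simply call activation_functor(res). A scalar sketch of that shape, with illustrative names:

#include <algorithm>

// Scalar analogue of detail::relu<T, S>: the constant captured at
// construction ("vzero" in the real code) is applied in operator().
struct relu_ref
{
    explicit relu_ref(float zero = 0.0f) : vzero(zero)
    {
    }

    void operator()(float &x) const
    {
        x = std::max(x, vzero);
    }

    float vzero;
};

Because every functor exposes this constructor-plus-operator() shape, the same kernel call site works for whichever activation was selected.
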
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
index ac196d9dbb..50fff04cad 100644
--- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IMultiImage.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/NEON/NEMath.h"
#include <arm_neon.h>
@@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f;
constexpr float rgb2u8_green_coef = 0.7152f;
constexpr float rgb2u8_blue_coef = 0.0722f;
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
- const float rcoef, const float gcoef, const float bcoef)
+inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor,
+ const float32x4_t &gcolor,
+ const float32x4_t &bcolor,
+ const float rcoef,
+ const float gcoef,
+ const float bcoef)
{
float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef);
@@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
}
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
- float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec,
+ const float32x4_t &gvec,
+ const float32x4_t &bvec,
+ float32x4_t &yvec,
+ float32x4_t &uvec,
+ float32x4_t &vvec)
{
/*
Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
@@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g
vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
}
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
- float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val,
+ float32x4_t uvec_val,
+ const float32x4_t &yyvec_val,
+ float32x4_t vvec_val,
+ unsigned char *output_ptr,
+ const bool alpha)
{
float32x4x3_t rgb1, rgb2;
@@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
// b = 1.8556f*f_u + 0.0000f*f_v;
const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
- const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
- vmulq_n_f32(vvec_val, green_coef2_bt709));
+ const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709));
// Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
// the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
@@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
uint8x8x3_t u8_rgb;
arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
- if(!alpha)
+ if (!alpha)
{
vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
@@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
{
uint8x16x3_t rgb;
- if(alpha)
+ if (alpha)
{
const auto tmp = vld4q_u8(ptr);
rgb.val[0] = tmp.val[0];
@@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
- fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
- rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
- fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+ rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i],
+ fvvec_top.val[i]);
+ rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i],
+ fuvec_bottom.val[i], fvvec_bottom.val[i]);
}
arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
@@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
}
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_uv)
{
uint8x16x3_t vec_top, vec_bottom;
@@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec
vst2_u8(out_uv, uvvec);
}
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
{
@@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec
const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
- const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
- vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+ const auto uvvec =
+ vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
vst1_u8(out_u, vget_low_u8(uvvec));
vst1_u8(out_v, vget_high_u8(uvvec));
}
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec,
+ const uint8x16_t &gvec,
+ const uint8x16_t &bvec,
unsigned char *const __restrict out_y,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
@@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
float32x4x4_t fyvec, fuvec, fvvec;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
- fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+ rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]);
}
uint8x16_t yvec, uvec, vvec;
@@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
vst1q_u8(out_v, vvec);
}
#endif /* DOXYGEN_SKIP_THIS */
-}
+} // namespace
namespace arm_compute
{
@@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16x4_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- ta2.val[3] = vdupq_n_u8(255);
- vst4q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16x4_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ ta2.val[3] = vdupq_n_u8(255);
+ vst4q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGB to U8.
@@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16_t ta2;
- rgb_to_u8_conversion(ta1, ta2);
- vst1q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16_t ta2;
+ rgb_to_u8_conversion(ta1, ta2);
+ vst1q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGBX to RGB.
@@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld4q_u8(in.ptr());
- uint8x16x3_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- vst3q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld4q_u8(in.ptr());
+ uint8x16x3_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ vst3q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert YUYV to RGB.
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta = vld4q_u8(in.ptr());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
- const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
- const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
- const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
-
- yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta = vld4q_u8(in.ptr());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
+ const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
+ const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
+ const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
+
+ yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size,
+ alpha);
+ },
+ in, out);
}
/** Convert NV12 to RGB.
@@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_uv(input_ptr->plane(1), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
-
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_uv, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
+
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_uv, out);
}
/** Convert IYUV to RGB.
@@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_v(input_ptr->plane(2), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto *y_top_ptr = in_y.ptr();
- const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
- const auto *u_ptr = in_u.ptr();
- const auto *v_ptr = in_v.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto *y_top_ptr = in_y.ptr();
+ const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+ const auto *u_ptr = in_u.ptr();
+ const auto *v_ptr = in_v.ptr();
// Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation
#if defined(__arch64__)
- const auto ta0_y_top = vld1q_u8(y_top_ptr);
- const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
- const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
- const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta0_y_top = vld1q_u8(y_top_ptr);
+ const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
+ const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+ const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t yyvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#else /* defined(__arch64__) */
- const auto ta_y_top = vld2q_u8(y_top_ptr);
- const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u.val[0] = U0 U2 U4 U6 ...
- //ta_v.val[0] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta_y_top = vld2q_u8(y_top_ptr);
+ const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u.val[0] = U0 U2 U4 U6 ...
+ //ta_v.val[0] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#endif /* defined(__arch64__) */
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_u, in_v, out);
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_u, in_v, out);
}
/** Convert YUYV to NV12.
@@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V6 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16x2_t uvvec;
- uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst2q_u8(out_uv.ptr(), uvvec);
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16x2_t uvvec;
+ uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst2q_u8(out_uv.ptr(), uvvec);
+ },
+ in, out_y, out_uv);
}
/** Convert IYUV to NV12.
@@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- uint8x16x2_t ta_uv;
- ta_uv.val[0] = vld1q_u8(in_u.ptr());
- ta_uv.val[1] = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst2q_u8(out_uv.ptr(), ta_uv);
- },
- in_y, in_u, in_v, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ uint8x16x2_t ta_uv;
+ ta_uv.val[0] = vld1q_u8(in_u.ptr());
+ ta_uv.val[1] = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst2q_u8(out_uv.ptr(), ta_uv);
+ },
+ in_y, in_u, in_v, out_y, out_uv);
}
/** Convert NV12 to IYUV.
@@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
- vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+ vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
/** Convert YUYV to IYUV.
@@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V7 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16_t uvec;
- uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- vst1q_u8(out_u.ptr(), uvec);
-
- uint8x16_t vvec;
- vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst1q_u8(out_v.ptr(), vvec);
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16_t uvec;
+ uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ vst1q_u8(out_u.ptr(), uvec);
+
+ uint8x16_t vvec;
+ vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst1q_u8(out_v.ptr(), vvec);
+ },
+ in, out_y, out_u, out_v);
}
/** Convert NV12 to YUV4.
@@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_uv.val[0 + shift];
- uvec.val[1] = ta_uv.val[0 + shift];
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_uv.val[1 - shift];
- vvec.val[1] = ta_uv.val[1 - shift];
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_uv.val[0 + shift];
+ uvec.val[1] = ta_uv.val[0 + shift];
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_uv.val[1 - shift];
+ vvec.val[1] = ta_uv.val[1 - shift];
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
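The YUV4 paths above go the other way and simply replicate each chroma sample 2x2: vst2q_u8 with the same vector in both fields doubles it horizontally, and writing the same data to two output rows doubles it vertically. A standalone sketch for NV12 ordering (U first, shift == 0), with illustrative names:

#include <arm_neon.h>
#include <cstdint>

// Expand one row of interleaved UV samples to two full-resolution U and V rows.
void upsample_uv_row(const uint8_t *uv_row,
                     uint8_t *u_row0, uint8_t *u_row1,
                     uint8_t *v_row0, uint8_t *v_row1)
{
    const uint8x16x2_t uv = vld2q_u8(uv_row); // uv.val[0] = U samples, uv.val[1] = V samples

    const uint8x16x2_t u2 = {{uv.val[0], uv.val[0]}}; // stores as U0 U0 U1 U1 ...
    vst2q_u8(u_row0, u2);
    vst2q_u8(u_row1, u2);

    const uint8x16x2_t v2 = {{uv.val[1], uv.val[1]}};
    vst2q_u8(v_row0, v2);
    vst2q_u8(v_row1, v2);
}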
/** Convert IYUV to YUV4.
@@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_u = vld1q_u8(in_u.ptr());
- const auto ta_v = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u = U0 U2 U4 U6 ...
- //ta_v = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_u;
- uvec.val[1] = ta_u;
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_v;
- vvec.val[1] = ta_v;
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_u, in_v, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_u = vld1q_u8(in_u.ptr());
+ const auto ta_v = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u = U0 U2 U4 U6 ...
+ //ta_v = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_u;
+ uvec.val[1] = ta_u;
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_v;
+ vvec.val[1] = ta_v;
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_u, in_v, out_y, out_u, out_v);
}
/** Convert RGB to NV12.
@@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_uv.ptr());
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr());
+ },
+ in, out_y, out_uv);
}
/** Convert RGB to IYUV.
@@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(),
+ out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
/** Convert RGB to YUV4.
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb = load_rgb(in.ptr(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
- out_y.ptr(), out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb = load_rgb(in.ptr(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
} // namespace arm_compute
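The RGB paths above delegate the arithmetic to load_rgb and the store_rgb_to_* helpers defined earlier in this file (not shown in these hunks). For orientation only, a scalar BT.601 full-range conversion of a single pixel looks roughly as follows; the helpers' exact coefficients, rounding and vectorisation may differ, and the names below are illustrative.

#include <algorithm>
#include <cstdint>

inline uint8_t clamp_u8(float v)
{
    return static_cast<uint8_t>(std::min(std::max(v + 0.5f, 0.f), 255.f)); // round and clamp
}

// Generic BT.601 (full-range) RGB -> YUV for one pixel; not the library's exact constants.
inline void rgb_to_yuv_px(uint8_t r, uint8_t g, uint8_t b, uint8_t &y, uint8_t &u, uint8_t &v)
{
    const float yf = 0.299f * r + 0.587f * g + 0.114f * b;
    y = clamp_u8(yf);
    u = clamp_u8(0.564f * (b - yf) + 128.f); // Cb
    v = clamp_u8(0.713f * (r - yf) + 128.f); // Cr
}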
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index 96defbc9c9..4b1eb079b2 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -33,56 +33,32 @@ namespace detail
{
inline float32x4x3_t load_matrix_row(const float *ptr)
{
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
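load_matrix_row broadcasts each of the three kernel coefficients across all lanes so one row of the 3x3 filter can be applied to four adjacent output positions with plain multiplies; the x+1 and x+2 taps are realised with vextq_f32, as in convolve_3x3<1> below. A self-contained sketch of that consumption pattern (the function name is illustrative):

#include <arm_neon.h>

// Four adjacent outputs of one 3-tap kernel row: out[i] = sum_k in[i + k] * w[k].
float32x4_t convolve_row_3tap(const float *in, const float32x4x3_t &w)
{
    const float32x4_t a = vld1q_f32(in);     // in[0..3]
    const float32x4_t b = vld1q_f32(in + 4); // in[4..7]
    float32x4_t acc = vmulq_f32(a, w.val[0]);           // tap at x
    acc = vmlaq_f32(acc, vextq_f32(a, b, 1), w.val[1]); // tap at x + 1
    acc = vmlaq_f32(acc, vextq_f32(a, b, 2), w.val[2]); // tap at x + 2
    return acc;
}

A full 3x3 convolution accumulates three such rows (top/mid/low), which is what the specialisations below do before handling the stride.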
template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
+float32x4x2_t convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2);
template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<1>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- float32x4x2_t out =
- {
- {
- vmulq_f32(vtop.val[0], m0.val[0]),
- vmulq_f32(vtop.val[1], m0.val[0])
- }
- };
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}};
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
@@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<2>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
@@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<3>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
@@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio
{
return num_elems_written_per_iteration * 3;
}
-}
+} // namespace detail
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
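The stride-2 and stride-3 specialisations above reuse the stride-1 result and then compact it: the eight consecutive outputs held in out.val[0..1] are thinned to every second (or third) element with vgetq_lane/vsetq_lane before storing. A sketch of the stride-2 case (illustrative name):

#include <arm_neon.h>

// Pack the stride-2 outputs o0 o2 o4 o6 from eight consecutive stride-1 outputs.
float32x4_t compact_stride2(const float32x4x2_t &s1) // s1 = {o0..o3, o4..o7}
{
    float32x4_t out = s1.val[0];                                // o0 o1 o2 o3
    out = vsetq_lane_f32(vgetq_lane_f32(s1.val[0], 2), out, 1); // o0 o2 o2 o3
    out = vsetq_lane_f32(vgetq_lane_f32(s1.val[1], 0), out, 2); // o0 o2 o4 o3
    out = vsetq_lane_f32(vgetq_lane_f32(s1.val[1], 2), out, 3); // o0 o2 o4 o6
    return out;
}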
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 7ba52a16b7..fd1ee54597 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -45,14 +45,7 @@ namespace detail
inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
{
ARM_COMPUTE_UNUSED(weights_offset);
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
@@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
*
* @return The loaded matrix.
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
{
const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- int32x4x3_t r =
- {
- {
- vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
- }
- };
+ int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}};
return r;
}
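The quantized load_matrix_row above performs the same broadcast as the float version but widens the 8-bit weights to int32 and folds in weights_offset (typically the negated quantization zero point of the weights) once, so the inner loop can stay in plain 32-bit multiply-accumulates. A concrete uint8_t instantiation, with an illustrative name:

#include <arm_neon.h>
#include <cstdint>

int32x4x3_t broadcast_row_q(const uint8_t *w, int weights_offset)
{
    const int32x4_t off = vdupq_n_s32(weights_offset);
    const int32x4x3_t r = {{vaddq_s32(off, vdupq_n_s32(w[0])),
                            vaddq_s32(off, vdupq_n_s32(w[1])),
                            vaddq_s32(off, vdupq_n_s32(w[2]))}};
    return r; // r.val[k] holds (w[k] + weights_offset) in every lane
}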
@@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, int input_offset)
+inline float32x4_t single_convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + dilation_x),
- vld1q_f32(in_top + 2 * dilation_x)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + dilation_x),
- vld1q_f32(in_mid + 2 * dilation_x)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + dilation_x),
- vld1q_f32(in_low + 2 * dilation_x)
- }
- };
+ const float32x4x3_t vtop = {
+ {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}};
+ const float32x4x3_t vmid = {
+ {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}};
+ const float32x4x3_t vlow = {
+ {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}};
float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
@@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+inline float32x4x2_t convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ float32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
}
@@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_
*
*/
template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset = 0);
+void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0);
template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset)
+inline void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f)
- }
- };
- if(stridex == 2)
+ float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}};
+ if (stridex == 2)
{
const float32x4x2_t vtop = vld2q_f32(in_top);
const float32x4x2_t vmid = vld2q_f32(in_mid);
@@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
}
else
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
@@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- size_t dilation_x, int32_t input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4_t single_convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ size_t dilation_x,
+ int32_t input_offset)
{
using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + dilation_x),
- wrapper::vload(in_top + 2 * dilation_x)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + dilation_x),
- wrapper::vload(in_mid + 2 * dilation_x)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + dilation_x),
- wrapper::vload(in_low + 2 * dilation_x)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
- }
- };
+ const VectorType vtop = {
+ {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}};
+ const VectorType vmid = {
+ {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}};
+ const VectorType vlow = {
+ {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
+ }};
int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
@@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid,
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4x2_t convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- int32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ int32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
}
@@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const
* @param[in] input_offset Input quantization offset.
*
*/
-template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- unsigned int stridex, int32_t input_offset)
+template <bool accumulate,
+ typename T1,
+ typename T2,
+ ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value)>
+void convolve_3x3(const T1 *in_top,
+ const T1 *in_mid,
+ const T1 *in_low,
+ T2 *out_ptr,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ unsigned int stridex,
+ int32_t input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + 8)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + 8)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + 8)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- }
- };
-
- int32x4x2_t out
- {
- {
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- }
- };
+ const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}};
+ const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}};
+ const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ }};
+
+ int32x4x2_t out{{
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ }};
// 0
out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
@@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
- if(stridex == 1)
+ if (stridex == 1)
{
accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
}
- else if(stridex == 2)
+ else if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
@@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
ARM_COMPUTE_UNUSED(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- const float16x8x3_t r =
- {
- {
- vld1q_dup_f16(ptr),
- vld1q_dup_f16(1 + ptr),
- vld1q_dup_f16(2 + ptr)
- }
- };
+ const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}};
return r;
}
@@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
 * @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, int input_offset = 0)
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + dilation_x),
- vld1q_f16(in_top + 2 * dilation_x)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + dilation_x),
- vld1q_f16(in_mid + 2 * dilation_x)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + dilation_x),
- vld1q_f16(in_low + 2 * dilation_x)
- }
- };
+ const float16x8x3_t vtop = {
+ {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}};
+ const float16x8x3_t vmid = {
+ {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}};
+ const float16x8x3_t vlow = {
+ {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}};
float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
@@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
- float16x8x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
+inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
+{
+ float16x8x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}};
+
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
@@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
*
*/
template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- unsigned int stridex, int input_offset = 0)
+inline void convolve_3x3(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ float16_t *out_ptr,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- float16x8x2_t out =
- {
- {
- vdupq_n_f16(0),
- vdupq_n_f16(0)
- }
- };
- if(stridex == 2)
+ float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}};
+ if (stridex == 2)
{
const float16x8x2_t vtop = vld2q_f16(in_top);
const float16x8x2_t vmid = vld2q_f16(in_mid);
@@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
}
else
{
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + 8),
- vld1q_f16(in_top + 16)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + 8),
- vld1q_f16(in_mid + 16)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + 8),
- vld1q_f16(in_low + 16)
- }
- };
- out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+ const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}};
+ const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}};
+ const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}};
+ out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
@@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
*/
inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
{
- switch(stridex)
+ switch (stridex)
{
case 1:
return num_elems_written_per_iteration;
@@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter
return 0;
}
}
-}
+} // namespace detail
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
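The *_dilation overloads in this header differ from the dense path only in how the taps are gathered: the three loads of a kernel row are dilation_x elements apart instead of adjacent, and the multiply-accumulate structure is unchanged. A minimal float sketch (illustrative name):

#include <arm_neon.h>
#include <cstddef>

// Four adjacent outputs of one dilated 3-tap kernel row.
float32x4_t convolve_row_3tap_dilated(const float *in, const float32x4x3_t &w, size_t dilation_x)
{
    const float32x4_t t0 = vld1q_f32(in);                  // x
    const float32x4_t t1 = vld1q_f32(in + dilation_x);     // x + dilation_x
    const float32x4_t t2 = vld1q_f32(in + 2 * dilation_x); // x + 2 * dilation_x
    float32x4_t acc = vmulq_f32(t0, w.val[0]);
    acc = vmlaq_f32(acc, t1, w.val[1]);
    acc = vmlaq_f32(acc, t2, w.val[2]);
    return acc;
}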
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
index 1c77a9e9f0..381de2284a 100644
--- a/src/core/NEON/wrapper/intrinsics/cvt.h
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -30,12 +30,11 @@ namespace arm_compute
{
namespace wrapper
{
-#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
+#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
}
VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
@@ -46,12 +45,11 @@ VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
#undef VCVT_TO_F32_IMPL
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type vcvt(const vtype &a) \
+ { \
+ return prefix##_##postfix1##_##postfix2(a); \
}
VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
@@ -59,14 +57,14 @@ VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t >::type
+inline typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t>::type
vcvt(const float32x4_t &a)
{
return vcvtq_u32_f32(a);
}
template <typename T>
-inline typename std::enable_if < std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t >::type
+inline typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t>::type
vcvt(const float32x4_t &a)
{
return vcvtq_s32_f32(a);
@@ -74,15 +72,13 @@ vcvt(const float32x4_t &a)
#ifdef __aarch64__
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type
-vcvta(const float32x4_t &a)
+inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvta(const float32x4_t &a)
{
return vcvtaq_u32_f32(a);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type
-vcvta(const float32x4_t &a)
+inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvta(const float32x4_t &a)
{
return vcvtaq_s32_f32(a);
}
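All of the conversion wrappers in this header follow the same idiom: the destination element type is the template parameter and std::enable_if selects the overload, so call sites read vcvt<float>(v) or vcvta<int32_t>(v) instead of spelling out the intrinsic name. A self-contained sketch of that dispatch (my_cvt is an illustrative name, not the wrapper's):

#include <arm_neon.h>
#include <type_traits>
#include <cstdint>

template <typename T>
typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type my_cvt(const uint32x4_t &a)
{
    return vcvtq_f32_u32(a);
}

template <typename T>
typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type my_cvt(const float32x4_t &a)
{
    return vcvtq_s32_f32(a);
}

// usage: const float32x4_t f = my_cvt<float>(vdupq_n_u32(42));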
@@ -96,14 +92,13 @@ vcvta(const float32x4_t &a)
*/
inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
{
- __asm __volatile(
- "ldp q0, q1, [%[inptr]]\n"
- ".inst 0xea16800\n" // BFCVTN v0, v0
- ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
- "str q0, [%[outptr]]\n"
- : [inptr] "+r"(inptr)
- : [outptr] "r"(outptr)
- : "v0", "v1", "memory");
+ __asm __volatile("ldp q0, q1, [%[inptr]]\n"
+ ".inst 0xea16800\n" // BFCVTN v0, v0
+ ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
+ "str q0, [%[outptr]]\n"
+ : [inptr] "+r"(inptr)
+ : [outptr] "r"(outptr)
+ : "v0", "v1", "memory");
}
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
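For reference, the LDP/BFCVTN/BFCVTN2/STR sequence above narrows eight f32 values to eight bf16 values in one go. A portable scalar sketch of the conversion using round-to-nearest-even (the usual bf16 rounding; NaN handling and the instruction's exact semantics are defined by the architecture, not by this sketch):

#include <cstdint>
#include <cstring>

inline uint16_t f32_to_bf16_rne(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const uint32_t lsb = (bits >> 16) & 1u;   // ties go to even
    bits += 0x7FFFu + lsb;
    return static_cast<uint16_t>(bits >> 16); // bf16 keeps the top 16 bits of the f32
}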
diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h
index 265f30d33b..ece991a5b0 100644
--- a/src/core/NEON/wrapper/intrinsics/div.h
+++ b/src/core/NEON/wrapper/intrinsics/div.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_DIV_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h
index e2207648e5..0e34462b96 100644
--- a/src/core/NEON/wrapper/intrinsics/erf.h
+++ b/src/core/NEON/wrapper/intrinsics/erf.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_WRAPPER_ERF_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h
index c2a6970967..f44577b926 100644
--- a/src/core/NEON/wrapper/intrinsics/exp.h
+++ b/src/core/NEON/wrapper/intrinsics/exp.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_EXP_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h
index 2052751612..ae813bb2fa 100644
--- a/src/core/NEON/wrapper/intrinsics/getlane.h
+++ b/src/core/NEON/wrapper/intrinsics/getlane.h
@@ -33,7 +33,7 @@ namespace wrapper
#define VGETLANE_IMPL_8(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -59,7 +59,7 @@ namespace wrapper
#define VGETLANE_IMPL_4(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -77,7 +77,7 @@ namespace wrapper
#define VGETLANE_IMPL_2(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vget_lane_##postfix(vector, 0); \
@@ -102,7 +102,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_16(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -144,7 +144,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_8(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -170,7 +170,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_4(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
@@ -188,7 +188,7 @@ VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
#define VGETQLANE_IMPL_2(stype, vtype, postfix) \
inline stype vgetlane(const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vgetq_lane_##postfix(vector, 0); \
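The switches generated by these macros exist because the vget(q)_lane_* intrinsics require a compile-time-constant lane index, so a runtime lane has to be expanded into one case per lane. A hand-written 4-lane equivalent of what VGETQLANE_IMPL_4 produces for f32:

#include <arm_neon.h>
#include <cassert>

inline float getlane_f32(const float32x4_t vector, const unsigned int lane)
{
    switch (lane)
    {
        case 0:
            return vgetq_lane_f32(vector, 0);
        case 1:
            return vgetq_lane_f32(vector, 1);
        case 2:
            return vgetq_lane_f32(vector, 2);
        case 3:
            return vgetq_lane_f32(vector, 3);
        default:
            assert(false && "Invalid lane"); // the wrapper uses ARM_COMPUTE_ERROR here
            return 0.f;
    }
}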
diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h
index de398b0403..e443be679b 100644
--- a/src/core/NEON/wrapper/intrinsics/inv.h
+++ b/src/core/NEON/wrapper/intrinsics/inv.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_INV_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h
index 2343efa8f8..257b445cc7 100644
--- a/src/core/NEON/wrapper/intrinsics/invsqrt.h
+++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_INVSQRT_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h
index 357a77ca78..d091407edb 100644
--- a/src/core/NEON/wrapper/intrinsics/log.h
+++ b/src/core/NEON/wrapper/intrinsics/log.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_LOG_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h
index 61f834ed23..dfd6ccc358 100644
--- a/src/core/NEON/wrapper/intrinsics/pow.h
+++ b/src/core/NEON/wrapper/intrinsics/pow.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_POW_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h
index 167f3cf43b..9a0a23a241 100644
--- a/src/core/NEON/wrapper/intrinsics/qmov.h
+++ b/src/core/NEON/wrapper/intrinsics/qmov.h
@@ -31,15 +31,13 @@ namespace arm_compute
namespace wrapper
{
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type vqmov(const int16x8_t &a)
{
return vqmovun_s16(a);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type vqmov(const int16x8_t &a)
{
return vqmovn_s16(a);
}
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
index cf00a4aceb..c2c4f720d2 100644
--- a/src/core/NEON/wrapper/intrinsics/reinterpret.h
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -35,7 +35,7 @@ namespace wrapper
{ \
return prefix##_##postfix1##_##postfix2(a); \
} \
- \
+ \
inline ptype vreinterpret(const ptype &a) \
{ \
return a; \
diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h
index d23feb6b42..7789aab770 100644
--- a/src/core/NEON/wrapper/intrinsics/round.h
+++ b/src/core/NEON/wrapper/intrinsics/round.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_ROUND_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h
index 197eedacb5..259b8eaf90 100644
--- a/src/core/NEON/wrapper/intrinsics/setlane.h
+++ b/src/core/NEON/wrapper/intrinsics/setlane.h
@@ -33,7 +33,7 @@ namespace wrapper
#define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -59,7 +59,7 @@ namespace wrapper
#define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -77,7 +77,7 @@ namespace wrapper
#define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vset_lane_##postfix(value, vector, 0); \
@@ -102,7 +102,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
@@ -144,7 +144,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
@@ -170,7 +170,7 @@ VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \
inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
{ \
- switch(lane) \
+ switch (lane) \
{ \
case 0: \
return vsetq_lane_##postfix(value, vector, 0); \
diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h
index 73ca9c56c6..6ccb9cdf92 100644
--- a/src/core/NEON/wrapper/intrinsics/shr.h
+++ b/src/core/NEON/wrapper/intrinsics/shr.h
@@ -75,7 +75,7 @@ VQRSHRN_SCALAR_IMPL(uint32_t, uint64_t, vqrshrnd_n, u64)
{ \
return prefix_signed##_##postfix(a, b); \
} \
- \
+ \
template <int b, typename T> \
inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
vqrshrn_ex(const vtype &a) \
@@ -128,7 +128,7 @@ VSHRQ_SCALAR_IMPL(int32_t, vshrd_n, s64)
{ \
return prefix_signed##_##postfix(a, b); \
} \
- \
+ \
template <int b, typename T> \
inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
vqrshrn_ex(const vtype &a) \
diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h
index 03c2813a32..d24fdfa816 100644
--- a/src/core/NEON/wrapper/intrinsics/sin.h
+++ b/src/core/NEON/wrapper/intrinsics/sin.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_SIN_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -54,4 +55,4 @@ VSIN_IMPL_INT(int32x4_t, vsinq, s32)
#undef vsub_IMPL
} // namespace wrapper
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h
index e530e7c83f..c4652504b4 100644
--- a/src/core/NEON/wrapper/intrinsics/svcnt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcnt.h
@@ -30,7 +30,7 @@ namespace arm_compute
namespace wrapper
{
template <size_t element_size>
-inline uint64_t svcnt_size();
+inline uint64_t svcnt_size();
template <>
inline uint64_t svcnt_size<64>()
@@ -65,4 +65,4 @@ inline uint64_t svcnt()
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h
index 746b004d7d..00ef7b7eb3 100644
--- a/src/core/NEON/wrapper/intrinsics/svcvt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcvt.h
@@ -29,11 +29,12 @@ namespace arm_compute
{
namespace wrapper
{
-#define SVCVT_Z_TO_F32_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_f32_z(pg, a); \
+#define SVCVT_Z_TO_F32_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_f32_z(pg, a); \
}
SVCVT_Z_TO_F32_IMPL(svuint32_t)
@@ -42,11 +43,12 @@ SVCVT_Z_TO_F32_IMPL(svfloat16_t)
#undef SVCVT_Z_TO_F32_IMPL
-#define SVCVT_Z_TO_F16_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_f16_z(pg, a); \
+#define SVCVT_Z_TO_F16_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_f16_z(pg, a); \
}
SVCVT_Z_TO_F16_IMPL(svuint32_t)
@@ -55,11 +57,12 @@ SVCVT_Z_TO_F16_IMPL(svfloat32_t)
#undef SVCVT_Z_TO_F16_IMPL
-#define SVCVT_Z_TO_S32_IMPL(vtype) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
- { \
- return svcvt_s32_z(pg, a); \
+#define SVCVT_Z_TO_S32_IMPL(vtype) \
+ template <typename T> \
+ inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, \
+ const vtype &a) \
+ { \
+ return svcvt_s32_z(pg, a); \
}
SVCVT_Z_TO_S32_IMPL(svfloat16_t)
@@ -71,4 +74,4 @@ SVCVT_Z_TO_S32_IMPL(svfloat32_t)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h
index d6ce9a77d1..1e8bce3960 100644
--- a/src/core/NEON/wrapper/intrinsics/svexp.h
+++ b/src/core/NEON/wrapper/intrinsics/svexp.h
@@ -26,6 +26,7 @@
#if defined(__ARM_FEATURE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -46,4 +47,4 @@ SVEXP_IMPL(svfloat16_t, f16)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h
index 5b505ae1e3..b4630e20ed 100644
--- a/src/core/NEON/wrapper/intrinsics/svlog.h
+++ b/src/core/NEON/wrapper/intrinsics/svlog.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H
#if defined(__ARM_FEATURE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -44,4 +45,4 @@ SVLOG_IMPL(svfloat16_t, f16)
} // namespace wrapper
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h
index 53407e5301..6ed00bccbf 100644
--- a/src/core/NEON/wrapper/intrinsics/svptrue.h
+++ b/src/core/NEON/wrapper/intrinsics/svptrue.h
@@ -30,7 +30,7 @@ namespace arm_compute
namespace wrapper
{
template <size_t element_size>
-inline svbool_t svptrue_size();
+inline svbool_t svptrue_size();
template <>
inline svbool_t svptrue_size<64>()
@@ -65,4 +65,4 @@ svbool_t svptrue()
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
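Editor's note, illustrative only and not part of the patch: the svptrue_size<>() declaration realigned above follows an element-size dispatch pattern — a primary function template is declared, then specialised per lane width (64, 32, 16, 8 bits). A minimal stand-in sketch of that pattern, written in plain C++ with no SVE so it builds anywhere; all names below are hypothetical.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Primary template: declared only, one specialisation per element width.
    template <size_t element_size>
    uint64_t lanes_for();

    template <> uint64_t lanes_for<64>() { return 2; }  // e.g. 128-bit vector of 64-bit lanes
    template <> uint64_t lanes_for<32>() { return 4; }
    template <> uint64_t lanes_for<16>() { return 8; }
    template <> uint64_t lanes_for<8>()  { return 16; }

    // Public entry point routes by the scalar type's width in bits,
    // mirroring how the wrapper selects svptrue_b64/b32/b16/b8.
    template <typename ScalarType>
    uint64_t lanes()
    {
        return lanes_for<sizeof(ScalarType) * 8>();
    }

    int main()
    {
        std::cout << lanes<float>() << '\n'; // prints 4
    }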
diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
index ef58217dc4..f0f84a9508 100644
--- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h
+++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
@@ -32,7 +32,7 @@ namespace wrapper
#define SVWHILELT_IMPL(type) \
template <size_t element_size> \
inline svbool_t svwhilelt_size(type a, type b); \
- \
+ \
template <> \
inline svbool_t svwhilelt_size<64>(type a, type b) \
{ \
@@ -70,4 +70,4 @@ inline svbool_t svwhilelt(IndexType a, IndexType b)
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */ \ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h
index daeaf19997..e74f0e86fe 100644
--- a/src/core/NEON/wrapper/intrinsics/tanh.h
+++ b/src/core/NEON/wrapper/intrinsics/tanh.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_WRAPPER_TANH_H
#include "src/core/NEON/NEMath.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h
index 642d9261f3..2ec88869e3 100644
--- a/src/core/NEON/wrapper/scalar/add.h
+++ b/src/core/NEON/wrapper/scalar/add.h
@@ -32,22 +32,22 @@ namespace wrapper
{
inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+ const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+ const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
return vget_lane_u8(vqadd_u8(va, vb), 0);
}
inline int16_t add_sat(const int16_t &a, const int16_t &b)
{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
+ const int16x4_t va = {a, 0, 0, 0};
+ const int16x4_t vb = {b, 0, 0, 0};
return vget_lane_s16(vqadd_s16(va, vb), 0);
}
inline int32_t add_sat(const int32_t &a, const int32_t &b)
{
- const int32x2_t va = { a, 0 };
- const int32x2_t vb = { b, 0 };
+ const int32x2_t va = {a, 0};
+ const int32x2_t vb = {b, 0};
return vget_lane_s32(vqadd_s32(va, vb), 0);
}
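Editor's note, illustrative only and not part of the patch: the add_sat() helpers reformatted above saturate by loading the scalars into lane 0 of a NEON vector, using the saturating vector add, and reading lane 0 back. A portable reference sketch of the same behaviour, assuming no NEON is available; add_sat_ref is a hypothetical name.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    static int16_t add_sat_ref(int16_t a, int16_t b)
    {
        // Widen, add, then clamp back into the int16_t range to saturate.
        const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
        return static_cast<int16_t>(std::clamp<int32_t>(sum, INT16_MIN, INT16_MAX));
    }

    int main()
    {
        std::cout << add_sat_ref(32000, 1000) << '\n'; // prints 32767 (saturated)
    }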
diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h
index 1fe51d75fc..00de7d867f 100644
--- a/src/core/NEON/wrapper/scalar/sub.h
+++ b/src/core/NEON/wrapper/scalar/sub.h
@@ -32,22 +32,22 @@ namespace wrapper
{
inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+ const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+ const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
return vget_lane_u8(vqsub_u8(va, vb), 0);
}
inline int16_t sub_sat(const int16_t &a, const int16_t &b)
{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
+ const int16x4_t va = {a, 0, 0, 0};
+ const int16x4_t vb = {b, 0, 0, 0};
return vget_lane_s16(vqsub_s16(va, vb), 0);
}
inline int32_t sub_sat(const int32_t &a, const int32_t &b)
{
- const int32x2_t va = { a, 0 };
- const int32x2_t vb = { b, 0 };
+ const int32x2_t va = {a, 0};
+ const int32x2_t vb = {b, 0};
return vget_lane_s32(vqsub_s32(va, vb), 0);
}
diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h
index 5ccd0ba8f1..330d272752 100644
--- a/src/core/NEON/wrapper/svtraits.h
+++ b/src/core/NEON/wrapper/svtraits.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_WRAPPER_SVTRAITS_H
#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index 99858e2a98..62ce335815 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Error.h"
+
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -36,7 +37,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy)
{
using namespace std;
int rounded = 0;
- switch(rounding_policy)
+ switch (rounding_policy)
{
case RoundingPolicy::TO_ZERO:
{
@@ -51,9 +52,7 @@ int arm_compute::round(float x, RoundingPolicy rounding_policy)
case RoundingPolicy::TO_NEAREST_EVEN:
{
#ifdef __aarch64__
- asm("fcvtns %x[res], %s[value]"
- : [res] "=r"(rounded)
- : [value] "w"(x));
+ asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x));
#else // __aarch64__
ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
#endif // __aarch64__
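Editor's note, illustrative only and not part of the patch: the reflowed asm statement above uses fcvtns, which converts a float to an integer with ties-to-even rounding. A portable sketch of the same rounding policy, for reference on targets where the instruction is unavailable; round_nearest_even is a hypothetical helper.

    #include <cfenv>
    #include <cmath>
    #include <iostream>

    static int round_nearest_even(float x)
    {
        std::fesetround(FE_TONEAREST);               // ties go to the even neighbour
        return static_cast<int>(std::nearbyintf(x)); // rounds per the current mode
    }

    int main()
    {
        std::cout << round_nearest_even(2.5f) << ' '   // 2 (tie rounds to even)
                  << round_nearest_even(3.5f) << '\n'; // 4
    }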
diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp
index 6eb46e56af..69b2651520 100644
--- a/src/core/Size2D.cpp
+++ b/src/core/Size2D.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Size2D.h"
+
#include "support/StringSupport.h"
namespace arm_compute
@@ -30,4 +31,4 @@ std::string Size2D::to_string() const
{
return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height);
}
-}
+} // namespace arm_compute
diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp
index 3ee9fb8e5c..b56a99acd7 100644
--- a/src/core/Size3D.cpp
+++ b/src/core/Size3D.cpp
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
#include "arm_compute/core/Size3D.h"
+
#include "support/StringSupport.h"
namespace arm_compute
{
std::string Size3D::to_string() const
{
- return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + support::cpp11::to_string(depth);
+ return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") +
+ support::cpp11::to_string(depth);
}
-} \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 723b6bc016..8012c3d721 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -42,10 +42,10 @@ namespace
TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords)
{
// Extend shape
- for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
{
int dimension_extend = coords[i] + static_cast<int>(shape[i]);
- if((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
+ if ((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
{
parent_shape.set(i, static_cast<size_t>(dimension_extend));
}
@@ -56,23 +56,35 @@ TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coo
} // namespace
SubTensorInfo::SubTensorInfo()
- : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false), _lock_paddings(false)
+ : _parent(nullptr),
+ _tensor_shape(),
+ _dims_state(),
+ _coords(),
+ _valid_region{Coordinates(), _tensor_shape},
+ _extend_parent(false),
+ _lock_paddings(false)
{
}
SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
- : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent), _lock_paddings(false)
+ : _parent(parent),
+ _tensor_shape(tensor_shape),
+ _dims_state(),
+ _coords(coords),
+ _valid_region{Coordinates(), _tensor_shape},
+ _extend_parent(extend_parent),
+ _lock_paddings(false)
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(parent->tensor_shape().total_size() != 0 && !_extend_parent)
+ if (parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
}
// Initialize valid region
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
}
std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
@@ -91,17 +103,17 @@ ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape)
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
+ if (_parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
- _valid_region = ValidRegion{ _coords, shape };
+ _valid_region = ValidRegion{_coords, shape};
}
- else if(_extend_parent) // Extend parent shape, configure if specified
+ else if (_extend_parent) // Extend parent shape, configure if specified
{
ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN));
TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords);
_parent->set_tensor_shape(parent_extended_shape);
- _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape });
+ _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape});
}
_tensor_shape = shape;
return *this;
@@ -133,11 +145,11 @@ bool SubTensorInfo::extend_padding(const PaddingSize &padding)
ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0);
// Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor
- if(!_extend_parent && (padding.left || padding.right))
+ if (!_extend_parent && (padding.left || padding.right))
{
ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x());
}
- if(!_extend_parent && (padding.top || padding.bottom))
+ if (!_extend_parent && (padding.top || padding.bottom))
{
ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y());
}
@@ -153,7 +165,7 @@ int32_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
int32_t offset = offset_first_element_in_bytes();
const Strides &strides = strides_in_bytes();
- for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
{
offset += pos[i] * strides[i];
}
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 5905ba5215..31bddbde40 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/Utils.h"
#include <memory>
@@ -34,13 +35,26 @@
namespace arm_compute
{
TensorInfo::TensorInfo()
- : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id), _lock_paddings(false)
-{
-}
-
-TensorInfo::TensorInfo(const ITensorInfo &info)
- : TensorInfo()
+ : _total_size(0),
+ _offset_first_element_in_bytes(0),
+ _strides_in_bytes(),
+ _num_channels(0),
+ _tensor_shape(),
+ _dims_state(),
+ _data_type(DataType::UNKNOWN),
+ _format(Format::UNKNOWN),
+ _is_resizable{true},
+ _valid_region{Coordinates(), _tensor_shape},
+ _padding{0},
+ _quantization_info(),
+ _data_layout(DataLayout::NCHW),
+ _are_values_constant(true),
+ _id(invalid_tensor_id),
+ _lock_paddings(false)
+{
+}
+
+TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo()
{
_total_size = info.total_size();
_offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -60,8 +74,7 @@ TensorInfo::TensorInfo(const ITensorInfo &info)
_lock_paddings = info.lock_paddings();
}
-TensorInfo::TensorInfo(const TensorInfo &info)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo()
{
_total_size = info.total_size();
_offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -80,8 +93,7 @@ TensorInfo::TensorInfo(const TensorInfo &info)
_id = info.id();
_lock_paddings = false;
}
-TensorInfo::TensorInfo(Format format)
- : TensorInfo(TensorShape(), format)
+TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format)
{
}
@@ -90,25 +102,25 @@ TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format)
{
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo()
{
init(tensor_shape, format);
}
-TensorInfo::TensorInfo(size_t num_channels, DataType data_type)
- : TensorInfo()
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo()
{
init(TensorShape(), num_channels, data_type);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type)
- : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo()
{
init(tensor_shape, num_channels, data_type);
}
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+TensorInfo::TensorInfo(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info)
: TensorInfo()
{
init(tensor_shape, num_channels, data_type);
@@ -137,9 +149,11 @@ void TensorInfo::init(const TensorShape &tensor_shape, Format format)
_format = format;
}
-void TensorInfo::init(const TensorShape &tensor_shape, Format format,
- const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+ Format format,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
{
size_t num_channels = num_channels_from_format(format);
const DataType type = data_type_from_format(format);
@@ -165,9 +179,12 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data
set_tensor_shape(tensor_shape);
}
-void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
- const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
{
ARM_COMPUTE_ERROR_ON(num_channels == 0);
@@ -179,7 +196,7 @@ void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, Data
_strides_in_bytes = strides_in_bytes;
_total_size = total_size_in_bytes;
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
}
size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
@@ -202,7 +219,7 @@ size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num
_format = Format::UNKNOWN;
_tensor_shape = tensor_shape;
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
auto_padding();
@@ -233,11 +250,11 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
size_t required_total_size = 0;
const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y;
- switch(_tensor_shape.num_dimensions())
+ switch (_tensor_shape.num_dimensions())
{
case 0:
{
- if(_tensor_shape.total_size() > 0)
+ if (_tensor_shape.total_size() > 0)
{
required_strides = Strides(stride_x, stride_x);
required_total_size = stride_z;
@@ -258,7 +275,8 @@ std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(c
const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- required_total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
+ required_total_size =
+ static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
break;
}
}
@@ -284,25 +302,25 @@ bool TensorInfo::extend_padding(const PaddingSize &padding)
bool updated = false;
- if(padding.top > _padding.top)
+ if (padding.top > _padding.top)
{
_padding.top = padding.top;
updated = true;
}
- if(padding.right > _padding.right)
+ if (padding.right > _padding.right)
{
_padding.right = padding.right;
updated = true;
}
- if(padding.bottom > _padding.bottom)
+ if (padding.bottom > _padding.bottom)
{
_padding.bottom = padding.bottom;
updated = true;
}
- if(padding.left > _padding.left)
+ if (padding.left > _padding.left)
{
_padding.left = padding.left;
updated = true;
@@ -336,7 +354,7 @@ ITensorInfo &TensorInfo::set_format(Format format)
{
_format = format;
- if(_data_type == DataType::UNKNOWN)
+ if (_data_type == DataType::UNKNOWN)
{
_num_channels = num_channels_from_format(format);
_data_type = data_type_from_format(format);
@@ -355,19 +373,19 @@ ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape)
_offset_first_element_in_bytes = 0;
_strides_in_bytes = compute_strides(*this);
- if(_tensor_shape.num_dimensions() == 0)
+ if (_tensor_shape.num_dimensions() == 0)
{
_total_size = _strides_in_bytes[0];
}
else
{
const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
- _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
+ _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
}
std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
- _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+ _valid_region = ValidRegion{Coordinates(), _tensor_shape};
return *this;
}
@@ -392,9 +410,10 @@ ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout)
ITensorInfo &TensorInfo::reset_padding()
{
_padding = PaddingSize();
- if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
+ if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
{
- std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+ std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) =
+ calculate_padding_requirements(_padding);
}
return *this;
}
@@ -405,7 +424,7 @@ int32_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
int32_t offset = _offset_first_element_in_bytes;
- for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
{
offset += pos[i] * _strides_in_bytes[i];
}
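Editor's note, illustrative only and not part of the patch: offset_element_in_bytes() above walks the coordinate vector and accumulates pos[i] * strides_in_bytes[i] on top of the first-element offset. A minimal standalone sketch of that arithmetic; the helper name and container types are hypothetical.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    static int32_t element_offset_bytes(const std::vector<int32_t> &pos,
                                        const std::vector<int32_t> &strides_in_bytes,
                                        int32_t                     offset_first_element)
    {
        int32_t offset = offset_first_element;
        for (size_t i = 0; i < pos.size(); ++i)
        {
            offset += pos[i] * strides_in_bytes[i]; // accumulate per-dimension byte strides
        }
        return offset;
    }

    int main()
    {
        // 4x3 float tensor, tightly packed: stride_x = 4 bytes, stride_y = 16 bytes.
        std::cout << element_offset_bytes({2, 1}, {4, 16}, 0) << '\n'; // prints 24
    }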
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 1ca7adb3a8..90a7ac32c0 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -49,7 +49,7 @@ std::string read_file(const std::string &filename, bool binary)
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
std::ios_base::openmode mode = std::ios::in;
- if(binary)
+ if (binary)
{
mode |= std::ios::binary;
}
@@ -66,7 +66,7 @@ std::string read_file(const std::string &filename, bool binary)
out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what());
}
@@ -77,32 +77,28 @@ std::string read_file(const std::string &filename, bool binary)
const std::string &string_from_channel(Channel channel)
{
- static std::map<Channel, const std::string> channels_map =
- {
- { Channel::UNKNOWN, "UNKNOWN" },
- { Channel::R, "R" },
- { Channel::G, "G" },
- { Channel::B, "B" },
- { Channel::A, "A" },
- { Channel::Y, "Y" },
- { Channel::U, "U" },
- { Channel::V, "V" },
- { Channel::C0, "C0" },
- { Channel::C1, "C1" },
- { Channel::C2, "C2" },
- { Channel::C3, "C3" }
- };
+ static std::map<Channel, const std::string> channels_map = {{Channel::UNKNOWN, "UNKNOWN"},
+ {Channel::R, "R"},
+ {Channel::G, "G"},
+ {Channel::B, "B"},
+ {Channel::A, "A"},
+ {Channel::Y, "Y"},
+ {Channel::U, "U"},
+ {Channel::V, "V"},
+ {Channel::C0, "C0"},
+ {Channel::C1, "C1"},
+ {Channel::C2, "C2"},
+ {Channel::C3, "C3"}};
return channels_map[channel];
}
const std::string &string_from_border_mode(BorderMode border_mode)
{
- static std::map<BorderMode, const std::string> border_mode_map =
- {
- { BorderMode::UNDEFINED, "UNDEFINED" },
- { BorderMode::CONSTANT, "CONSTANT" },
- { BorderMode::REPLICATE, "REPLICATE" },
+ static std::map<BorderMode, const std::string> border_mode_map = {
+ {BorderMode::UNDEFINED, "UNDEFINED"},
+ {BorderMode::CONSTANT, "CONSTANT"},
+ {BorderMode::REPLICATE, "REPLICATE"},
};
return border_mode_map[border_mode];
@@ -110,11 +106,10 @@ const std::string &string_from_border_mode(BorderMode border_mode)
const std::string &string_from_norm_type(NormType type)
{
- static std::map<NormType, const std::string> norm_type_map =
- {
- { NormType::IN_MAP_1D, "IN_MAP_1D" },
- { NormType::IN_MAP_2D, "IN_MAP_2D" },
- { NormType::CROSS_MAP, "CROSS_MAP" },
+ static std::map<NormType, const std::string> norm_type_map = {
+ {NormType::IN_MAP_1D, "IN_MAP_1D"},
+ {NormType::IN_MAP_2D, "IN_MAP_2D"},
+ {NormType::CROSS_MAP, "CROSS_MAP"},
};
return norm_type_map[type];
@@ -122,11 +117,10 @@ const std::string &string_from_norm_type(NormType type)
const std::string &string_from_pooling_type(PoolingType type)
{
- static std::map<PoolingType, const std::string> pool_type_map =
- {
- { PoolingType::MAX, "MAX" },
- { PoolingType::AVG, "AVG" },
- { PoolingType::L2, "L2" },
+ static std::map<PoolingType, const std::string> pool_type_map = {
+ {PoolingType::MAX, "MAX"},
+ {PoolingType::AVG, "AVG"},
+ {PoolingType::L2, "L2"},
};
return pool_type_map[type];
@@ -134,38 +128,36 @@ const std::string &string_from_pooling_type(PoolingType type)
bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info)
{
- if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
+ if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
{
return false;
}
const auto ps = info.pad_stride_info;
- const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() });
- const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() });
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()});
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()});
return pool_le_padding_x || pool_le_padding_y;
}
bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info)
{
- if(info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
+ if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
{
return false;
}
const auto ps = info.padding;
- const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.left, ps.right });
- const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.top, ps.bottom });
- const auto pool_le_padding_z = info.pool_size.z() <= std::max({ ps.front, ps.back });
+ const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right});
+ const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom});
+ const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back});
return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z;
}
const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
{
- static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
- {
- { GEMMLowpOutputStageType::NONE, "" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" },
- { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" }
- };
+ static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = {
+ {GEMMLowpOutputStageType::NONE, ""},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"},
+ {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}};
return output_stage_map[output_stage];
}
@@ -175,7 +167,7 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data
std::stringstream ss;
std::string converted_string;
- switch(data_type)
+ switch (data_type)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -223,11 +215,16 @@ std::string string_from_pixel_value(const PixelValue &value, const DataType data
return converted_string;
}
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation,
+PadStrideInfo calculate_same_pad(TensorShape input_shape,
+ TensorShape weights_shape,
+ PadStrideInfo conv_info,
+ DataLayout data_layout,
+ const Size2D &dilation,
const DimensionRoundingType &rounding_type)
{
const auto &strides = conv_info.stride();
- ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1.");
+ ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1),
+ "Stride values should be greater than or equal to 1.");
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -246,8 +243,9 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
const int real_weight_height = (kernel_height - 1) * dilation.y() + 1;
// Calculate total pad
- const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
- const int pad_height = std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
+ const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
+ const int pad_height =
+ std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
// Calculate individual paddings
const unsigned int pad_left = pad_width / 2;
@@ -265,8 +263,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
return same_info;
}
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
- unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width,
+ unsigned int in_height,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
const PadStrideInfo &pad_stride_info)
{
const unsigned int pad_left = pad_stride_info.pad_left();
@@ -285,8 +285,10 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i
return std::make_pair<unsigned int, unsigned int>(w, h);
}
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
- int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int width,
+ int height,
+ int kernel_width,
+ int kernel_height,
const PadStrideInfo &pad_stride_info,
const Size2D &dilation)
{
@@ -300,15 +302,25 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
const int stride_y = pad_stride_info.stride().second;
int w = 0;
int h = 0;
- switch(pad_stride_info.round())
+ switch (pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+ w = static_cast<int>(std::floor(
+ (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+ 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+ stride_y) +
+ 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+ w = static_cast<int>(std::ceil(
+ (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+ 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+ stride_y) +
+ 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -319,9 +331,8 @@ std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
return std::make_pair(static_cast<unsigned int>(w), static_cast<unsigned int>(h));
}
-std::pair<int, int> scaled_dimensions_signed(int width, int height,
- int kernel_width, int kernel_height,
- const PadStrideInfo &pad_stride_info)
+std::pair<int, int> scaled_dimensions_signed(
+ int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info)
{
const int pad_left = pad_stride_info.pad_left();
const int pad_top = pad_stride_info.pad_top();
@@ -331,15 +342,19 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
const int stride_y = pad_stride_info.stride().second;
int w = 0;
int h = 0;
- switch(pad_stride_info.round())
+ switch (pad_stride_info.round())
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<int>(
+ std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ w = static_cast<int>(
+ std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -348,8 +363,12 @@ std::pair<int, int> scaled_dimensions_signed(int width, int height,
return std::make_pair(static_cast<int>(w), static_cast<int>(h));
}
-std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int depth,
- int kernel_width, int kernel_height, int kernel_depth,
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int width,
+ int height,
+ int depth,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
const Pooling3dLayerInfo &pool3d_info)
{
const int pad_left = pool3d_info.padding.left;
@@ -365,17 +384,23 @@ std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int
int h = 0;
int d = 0;
- switch(pool3d_info.round_type)
+ switch (pool3d_info.round_type)
{
case DimensionRoundingType::FLOOR:
- w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
- d = static_cast<int>(std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+ w = static_cast<int>(
+ std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ d = static_cast<int>(
+ std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
break;
case DimensionRoundingType::CEIL:
- w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
- h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
- d = static_cast<int>(std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+ w = static_cast<int>(
+ std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+ h = static_cast<int>(
+ std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+ d = static_cast<int>(
+ std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
break;
default:
ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -400,9 +425,9 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
// * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128
// * LogSoftmax with QASYMM8: scale = 1/256, offset = 0
// * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127
- if(is_data_type_quantized_asymmetric_signed(input_type))
+ if (is_data_type_quantized_asymmetric_signed(input_type))
{
- if(is_log)
+ if (is_log)
{
return QuantizationInfo(16.f / 256, 127);
}
@@ -414,17 +439,21 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool
return QuantizationInfo(1.f / 256, 0);
}
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info)
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+ DataType data_type,
+ UniformQuantizationInfo oq_info)
{
const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type);
const auto a = act_info.a();
const auto b = act_info.b();
- const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
- const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
- const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
+ const int a_int = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
+ const int b_int = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
+ const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
- const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
- const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
+ const int32_t min_activation =
+ act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
+ const int32_t max_activation =
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
return std::make_pair(min_activation, max_activation);
}
@@ -433,11 +462,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
{
std::unordered_map<const ITensorInfo *, PaddingSize> res;
- for(const ITensor *tensor : tensors)
+ for (const ITensor *tensor : tensors)
{
- if(tensor)
+ if (tensor)
{
- res.insert({ tensor->info(), tensor->info()->padding() });
+ res.insert({tensor->info(), tensor->info()->padding()});
}
}
@@ -448,11 +477,11 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
{
std::unordered_map<const ITensorInfo *, PaddingSize> res;
- for(const ITensorInfo *info : infos)
+ for (const ITensorInfo *info : infos)
{
- if(info)
+ if (info)
{
- res.insert({ info, info->padding() });
+ res.insert({info, info->padding()});
}
}
@@ -461,17 +490,20 @@ std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initi
bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map)
{
- return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
- {
- return (padding_info.first->padding() != padding_info.second);
- })
- != padding_map.end();
+ return std::find_if(padding_map.begin(), padding_map.end(),
+ [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
+ { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end();
}
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
+void print_consecutive_elements(std::ostream &s,
+ DataType dt,
+ const uint8_t *ptr,
+ unsigned int n,
+ int stream_width,
+ const std::string &element_delim)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -481,36 +513,46 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
case DataType::QSYMM8:
case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
- print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U16:
case DataType::QASYMM16:
- print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S16:
case DataType::QSYMM16:
- print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U32:
- print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S32:
- print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::U64:
- print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::S64:
- print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::BFLOAT16:
- print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::F16:
- print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width,
+ element_delim);
break;
case DataType::F32:
- print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
+ print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width,
+ element_delim);
break;
default:
ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -519,7 +561,7 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n)
{
- switch(dt)
+ switch (dt)
{
case DataType::U8:
case DataType::QASYMM8:
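Editor's note, illustrative only and not part of the patch: the reflowed scaled_dimensions() code above evaluates the usual convolution output size, out = floor((in + pad_lo + pad_hi - ((kernel - 1) * dilation + 1)) / stride) + 1, per spatial dimension. A standalone sketch of the FLOOR branch for one dimension; scaled_dim_floor is a hypothetical helper.

    #include <cmath>
    #include <iostream>

    static int scaled_dim_floor(int in, int kernel, int pad_lo, int pad_hi, int stride, int dilation)
    {
        const int effective_kernel = dilation * (kernel - 1) + 1; // dilated kernel extent
        return static_cast<int>(
            std::floor(static_cast<float>(in + pad_lo + pad_hi - effective_kernel) / stride) + 1);
    }

    int main()
    {
        // 224-wide input, 3x3 kernel, pad 1 each side, stride 2, no dilation -> 112.
        std::cout << scaled_dim_floor(224, 3, 1, 1, 2, 1) << '\n';
    }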
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 5a6486e11e..d8f796193e 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -23,13 +23,16 @@
*/
#include "arm_compute/core/Validate.h"
-arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &win)
+arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &win)
{
full.validate();
win.validate();
- for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
@@ -38,13 +41,16 @@ arm_compute::Status arm_compute::error_on_mismatching_windows(const char *functi
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &sub)
+arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &sub)
{
full.validate();
sub.validate();
- for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
@@ -54,8 +60,12 @@ arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
- const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function,
+ const char *file,
+ const int line,
+ const arm_compute::Window &full,
+ const arm_compute::Window &window,
+ const int dim)
{
full.validate();
window.validate();
@@ -67,65 +77,73 @@ arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(co
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
- const arm_compute::Coordinates &pos, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(
+ const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim)
{
- for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line);
}
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
- const arm_compute::Window &win, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_window_dimensions_gte(
+ const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim)
{
- for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()),
- function, file, line,
- "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ (win[i].start() != 0) || (win[i].end() != win[i].step()), function, file, line,
+ "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
}
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function,
+ const char *file,
+ const int line,
const arm_compute::ITensor *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2,
- function, file, line,
- "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line,
+ "Only 2D Tensors are supported by this kernel (%zu passed)",
+ tensor->info()->num_dimensions());
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function,
+ const char *file,
+ const int line,
const arm_compute::ITensorInfo *tensor)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2,
- function, file, line,
- "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions());
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line,
+ "Only 2D Tensors are supported by this kernel (%zu passed)",
+ tensor->num_dimensions());
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
- arm_compute::Format fmt, arm_compute::Channel cn)
+arm_compute::Status arm_compute::error_on_channel_not_in_known_format(
+ const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
- switch(fmt)
+ switch (fmt)
{
case arm_compute::Format::RGB888:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+ arm_compute::Channel::G, arm_compute::Channel::B);
break;
case arm_compute::Format::RGBA8888:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+ arm_compute::Channel::G, arm_compute::Channel::B,
+ arm_compute::Channel::A);
break;
case arm_compute::Format::UV88:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U,
+ arm_compute::Channel::V);
break;
case arm_compute::Format::IYUV:
case arm_compute::Format::UYVY422:
@@ -133,7 +151,8 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char
case arm_compute::Format::NV12:
case arm_compute::Format::NV21:
case arm_compute::Format::YUV444:
- arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V);
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y,
+ arm_compute::Channel::U, arm_compute::Channel::V);
break;
default:
ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format.");
@@ -141,21 +160,26 @@ arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function,
+ const char *file,
+ const int line,
const arm_compute::IKernel *kernel)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(),
- function, file, line,
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line,
"This kernel hasn't been configured.");
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
- const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function,
+ const char *file,
+ const int line,
+ const TensorShape &parent_shape,
+ const Coordinates &coords,
+ const TensorShape &shape)
{
// Check dimensions
- for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
{
const bool invalid_idx = coords[i] >= static_cast<int>(parent_shape[i]);
const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]);
@@ -164,15 +188,20 @@ arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function
return arm_compute::Status{};
}
-arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
- const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function,
+ const char *file,
+ const int line,
+ const ValidRegion &parent_valid_region,
+ const ValidRegion &valid_region)
{
// Check valid regions
- for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
- function, file, line);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(
+ (parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) <
+ (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+ function, file, line);
}
return arm_compute::Status{};
diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h
index d791154e5c..bc0ea29911 100644
--- a/src/core/common/Macros.h
+++ b/src/core/common/Macros.h
@@ -25,9 +25,9 @@
#define ARM_COMPUTE_COMMON_MACROS_H
#define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \
- TypeName(const TypeName &) = delete; \
+ TypeName(const TypeName &) = delete; \
TypeName &operator=(const TypeName &) = delete; \
TypeName(TypeName &&) = default; \
- TypeName &operator=(TypeName &&) = default
+ TypeName &operator=(TypeName &&) = default
#endif /* ARM_COMPUTE_COMMON_MACROS_H */
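Editor's note, illustrative only and not part of the patch: the ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE macro realigned above deletes a class's copy constructor and copy assignment while defaulting its move operations. A self-contained usage sketch, with the macro repeated locally so it compiles on its own; ExampleKernel is a hypothetical class.

    #include <utility>

    #define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \
        TypeName(const TypeName &)            = delete;    \
        TypeName &operator=(const TypeName &) = delete;    \
        TypeName(TypeName &&)                 = default;   \
        TypeName &operator=(TypeName &&)      = default

    class ExampleKernel
    {
    public:
        ExampleKernel() = default;
        ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ExampleKernel);
    };

    int main()
    {
        ExampleKernel a;
        ExampleKernel b(std::move(a)); // moves allowed; copies are rejected at compile time
        (void)b;
    }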
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index d6dc3449fc..686304b8d7 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -46,7 +46,7 @@
#else /* !defined(ENABLE_FP16_KERNELS) */
#define REGISTER_FP16_NEON(func_name) nullptr
-#define REGISTER_FP16_SVE(func_name) nullptr
+#define REGISTER_FP16_SVE(func_name) nullptr
#define REGISTER_FP16_SVE2(func_name) nullptr
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
@@ -72,7 +72,7 @@
#else /* defined(ENABLE_FP32_KERNELS) */
#define REGISTER_FP32_NEON(func_name) nullptr
-#define REGISTER_FP32_SVE(func_name) nullptr
+#define REGISTER_FP32_SVE(func_name) nullptr
#define REGISTER_FP32_SVE2(func_name) nullptr
#endif /* defined(ENABLE_FP32_KERNELS) */
@@ -94,7 +94,7 @@
#else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
#define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
#define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
@@ -115,7 +115,7 @@
#else /* defined(ENABLE_QASYMM8_KERNELS) */
#define REGISTER_QASYMM8_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
#define REGISTER_QASYMM8_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QASYMM8_KERNELS) */
@@ -137,7 +137,7 @@
#else /* defined(ENABLE_QSYMM16_KERNELS) */
#define REGISTER_QSYMM16_NEON(func_name) nullptr
-#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
#define REGISTER_QSYMM16_SVE2(func_name) nullptr
#endif /* defined(ENABLE_QSYMM16_KERNELS) */
@@ -169,7 +169,7 @@
#else /* defined(ENABLE_INTEGER_KERNELS) */
#define REGISTER_INTEGER_NEON(func_name) nullptr
-#define REGISTER_INTEGER_SVE(func_name) nullptr
+#define REGISTER_INTEGER_SVE(func_name) nullptr
#define REGISTER_INTEGER_SVE2(func_name) nullptr
#endif /* defined(ENABLE_INTEGER_KERNELS) */
diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h
index 8715dcd74b..9df2a76983 100644
--- a/src/core/helpers/AutoConfiguration.h
+++ b/src/core/helpers/AutoConfiguration.h
@@ -24,9 +24,9 @@
#ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H
#define SRC_CORE_HELPERS_AUTOCONFIGURATION_H
-#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
namespace arm_compute
{
@@ -42,10 +42,11 @@ namespace arm_compute
*/
inline bool auto_init_if_empty(ITensorInfo &info,
const TensorShape &shape,
- int num_channels, DataType data_type,
- QuantizationInfo quantization_info = QuantizationInfo())
+ int num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info = QuantizationInfo())
{
- if(info.tensor_shape().total_size() == 0)
+ if (info.tensor_shape().total_size() == 0)
{
info.set_data_type(data_type);
info.set_num_channels(num_channels);
@@ -70,7 +71,7 @@ inline bool auto_init_if_empty(ITensorInfo &info,
*/
inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
{
- if(info_sink.tensor_shape().total_size() == 0)
+ if (info_sink.tensor_shape().total_size() == 0)
{
info_sink.set_data_type(info_source.data_type());
info_sink.set_num_channels(info_source.num_channels());
@@ -93,7 +94,7 @@ inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_s
*/
inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
{
- if(info.tensor_shape().total_size() == 0)
+ if (info.tensor_shape().total_size() == 0)
{
info.set_tensor_shape(shape);
return true;
@@ -112,7 +113,7 @@ inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
*/
inline bool set_format_if_unknown(ITensorInfo &info, Format format)
{
- if(info.data_type() == DataType::UNKNOWN)
+ if (info.data_type() == DataType::UNKNOWN)
{
info.set_format(format);
return true;
@@ -131,7 +132,7 @@ inline bool set_format_if_unknown(ITensorInfo &info, Format format)
*/
inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
{
- if(info.data_type() == DataType::UNKNOWN)
+ if (info.data_type() == DataType::UNKNOWN)
{
info.set_data_type(data_type);
return true;
@@ -150,7 +151,7 @@ inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
*/
inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
{
- if(info.data_layout() == DataLayout::UNKNOWN)
+ if (info.data_layout() == DataLayout::UNKNOWN)
{
info.set_data_layout(data_layout);
return true;
@@ -169,7 +170,7 @@ inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout
*/
inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
{
- if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+ if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
{
info.set_quantization_info(quantization_info);
return true;
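auto_init_if_empty only writes metadata into a tensor info whose shape is still empty, which is how operators auto-configure outputs the caller left unconfigured. A minimal usage sketch, assuming a translation unit inside the library tree where this internal header is visible; the shape and data type are placeholders.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

#include "src/core/helpers/AutoConfiguration.h"

using namespace arm_compute;

int main()
{
    TensorInfo dst_info; // empty shape: total_size() == 0

    // Fills in shape, channels and data type only because dst_info is still empty.
    const bool initialised =
        auto_init_if_empty(dst_info, TensorShape(16U, 16U, 3U), 1 /* num_channels */, DataType::F32);

    return initialised ? 0 : 1;
}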
diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h
index a41052687b..dd094b414c 100644
--- a/src/core/helpers/MemoryHelpers.h
+++ b/src/core/helpers/MemoryHelpers.h
@@ -24,9 +24,9 @@
#ifndef SRC_COMMON_MEMORY_HELPERS_H
#define SRC_COMMON_MEMORY_HELPERS_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
@@ -43,18 +43,17 @@ inline int offset_int_vec(int offset)
template <typename TensorType>
struct WorkspaceDataElement
{
- int slot{ -1 };
- experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary };
- std::unique_ptr<TensorType> tensor{ nullptr };
+ int slot{-1};
+ experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary};
+ std::unique_ptr<TensorType> tensor{nullptr};
};
template <typename TensorType>
using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>;
template <typename TensorType>
-WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
- MemoryGroup &mgroup,
- ITensorPack &run_pack)
+WorkspaceData<TensorType>
+manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack)
{
ITensorPack dummy_pack = ITensorPack();
return manage_workspace<TensorType>(mem_reqs, mgroup, run_pack, dummy_pack);
@@ -63,24 +62,26 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
template <typename TensorType>
WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
MemoryGroup &mgroup,
- ITensorPack &run_pack, ITensorPack &prep_pack)
+ ITensorPack &run_pack,
+ ITensorPack &prep_pack)
{
WorkspaceData<TensorType> workspace_memory;
- for(const auto &req : mem_reqs)
+ for (const auto &req : mem_reqs)
{
- if(req.size == 0)
+ if (req.size == 0)
{
continue;
}
- const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 };
- workspace_memory.emplace_back(WorkspaceDataElement<TensorType> { req.slot, req.lifetime, std::make_unique<TensorType>() });
+ const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8};
+ workspace_memory.emplace_back(
+ WorkspaceDataElement<TensorType>{req.slot, req.lifetime, std::make_unique<TensorType>()});
auto aux_tensor = workspace_memory.back().tensor.get();
ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor);
aux_tensor->allocator()->init(aux_info, req.alignment);
- if(req.lifetime == experimental::MemoryLifetime::Temporary)
+ if (req.lifetime == experimental::MemoryLifetime::Temporary)
{
mgroup.manage(aux_tensor);
}
@@ -91,7 +92,7 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
run_pack.add_tensor(req.slot, aux_tensor);
}
- for(auto &mem : workspace_memory)
+ for (auto &mem : workspace_memory)
{
auto tensor = mem.tensor.get();
tensor->allocator()->allocate();
@@ -103,31 +104,29 @@ WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirement
template <typename TensorType>
void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack)
{
- workspace.erase(std::remove_if(workspace.begin(),
- workspace.end(),
- [&prep_pack](auto & wk)
- {
- const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
- if(to_erase)
- {
- prep_pack.remove_tensor(wk.slot);
- }
- return to_erase;
- }),
- workspace.end());
+ workspace.erase(std::remove_if(workspace.begin(), workspace.end(),
+ [&prep_pack](auto &wk)
+ {
+ const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
+ if (to_erase)
+ {
+ prep_pack.remove_tensor(wk.slot);
+ }
+ return to_erase;
+ }),
+ workspace.end());
}
/** Utility function to release tensors with lifetime marked as Prepare */
template <typename TensorType>
-void release_temporaries(const experimental::MemoryRequirements &mem_reqs,
- WorkspaceData<TensorType> &workspace)
+void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData<TensorType> &workspace)
{
- for(auto &ws : workspace)
+ for (auto &ws : workspace)
{
const int slot = ws.slot;
- for(auto &m : mem_reqs)
+ for (auto &m : mem_reqs)
{
- if(m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
+ if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
{
auto tensor = ws.tensor.get();
tensor->allocator()->free();
diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h
index 079629ee6a..9ef045f472 100644
--- a/src/core/helpers/PoolingHelpers.h
+++ b/src/core/helpers/PoolingHelpers.h
@@ -33,8 +33,20 @@ namespace cpu
namespace
{
-inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
- const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+inline float calculate_avg_scale_pool3d(bool exclude_padding,
+ const Coordinates &id,
+ const int pool_size_x,
+ const int pool_size_y,
+ const int pool_size_z,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int upper_bound_d,
+ const int pad_x,
+ const int pad_y,
+ const int pad_z,
+ const int stride_x,
+ const int stride_y,
+ const int stride_z)
{
// Based on NDHWC
int start_x = id[1] * stride_x - pad_x;
@@ -44,7 +56,7 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates
const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
- if(exclude_padding)
+ if (exclude_padding)
{
start_x = std::max(0, start_x);
start_y = std::max(0, start_y);
@@ -53,8 +65,17 @@ inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates
return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
}
-inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+inline float calculate_avg_scale_pool2d(bool exclude_padding,
+ DataLayout data_layout,
+ const Coordinates &id,
+ const int pool_size_x,
+ const int pool_size_y,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int pad_x,
+ const int pad_y,
+ const int stride_x,
+ const int stride_y)
{
const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -64,7 +85,7 @@ inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_la
const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
- if(exclude_padding)
+ if (exclude_padding)
{
start_x = std::max(0, start_x);
start_y = std::max(0, start_y);
@@ -117,17 +138,26 @@ inline float32x4_t vcvtq_f32_q32(int32x4_t values)
}
template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
+inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset);
template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset)
{
const float new_scale = quant_rescale / scale_pooling;
return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
}
template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+ const float quant_rescale,
+ const float scale_pooling,
+ const int32_t new_offset)
{
const float new_scale = quant_rescale / scale_pooling;
return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
@@ -139,30 +169,24 @@ inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInf
template <>
inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
- }
- };
+ const float32x4x4_t acc = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
+ }};
return vquantize(acc, requant_qinfo);
}
template <>
inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
- }
- };
+ const float32x4x4_t acc = {{
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
+ }};
return vquantize_signed(acc, requant_qinfo);
}
@@ -172,26 +196,20 @@ inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinf
template <>
inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
- }
- };
+ const float32x4x2_t acc = {{
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
+ }};
return vquantize(acc, requant_qinfo);
}
template <>
inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
- }
- };
+ const float32x4x2_t acc = {{
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
+ }};
return vquantize_signed(acc, requant_qinfo);
}
@@ -199,4 +217,3 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */
-
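calculate_avg_scale_pool2d above returns the reciprocal of the number of elements that actually contribute to one average-pool output, clamping the pooling window to the tensor when padding is excluded. A standalone re-implementation of that arithmetic for a single output position; the upper bounds are supplied by the caller (the input extent, extended by the right/bottom padding when padding is included), and the values below are only illustrative.

#include <algorithm>
#include <cstdio>

// 1 / (number of contributing elements) for one 2D average-pool output.
float avg_pool_scale(bool exclude_padding,
                     int  out_x,
                     int  out_y,
                     int  pool_size_x,
                     int  pool_size_y,
                     int  upper_bound_w,
                     int  upper_bound_h,
                     int  pad_x,
                     int  pad_y,
                     int  stride_x,
                     int  stride_y)
{
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;

    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);

    if (exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

int main()
{
    // 3x3 pool, stride 1, pad 1, 5x5 input, top-left output position.
    std::printf("include padding: %f\n", avg_pool_scale(false, 0, 0, 3, 3, 6, 6, 1, 1, 1, 1)); // 1/9
    std::printf("exclude padding: %f\n", avg_pool_scale(true, 0, 0, 3, 3, 5, 5, 1, 1, 1, 1));  // 1/4
    return 0;
}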
diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h
index e769bba782..47605e7385 100644
--- a/src/core/helpers/ScaleHelpers.h
+++ b/src/core/helpers/ScaleHelpers.h
@@ -50,8 +50,12 @@ namespace scale_helpers
*
* @return The bilinear interpolated pixel value
*/
-inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy,
- UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr,
+ size_t stride,
+ float dx,
+ float dy,
+ UniformQuantizationInfo iq_info,
+ UniformQuantizationInfo oq_info)
{
ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
@@ -85,8 +89,12 @@ inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stri
*
* @return The bilinear interpolated pixel value
*/
-inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy,
- UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr,
+ size_t stride,
+ float dx,
+ float dy,
+ UniformQuantizationInfo iq_info,
+ UniformQuantizationInfo oq_info)
{
ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
@@ -122,9 +130,8 @@ inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride
*
* @return The pixel at (x, y) using area interpolation.
*/
-inline uint8_t
-pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr,
- float hr, int x, int y)
+inline uint8_t pixel_area_c1u8_clamp(
+ const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
{
ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
@@ -159,7 +166,7 @@ pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t widt
// Sum pixels in area
int sum = 0;
- for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+ for (int j = yi + y_from, je = yi + y_to; j <= je; ++j)
{
const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
sum = std::accumulate(ptr, ptr + x_elements, sum);
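delta_bilinear_c1_quantized blends the four pixels around a sampling point using the fractional offsets dx and dy, dequantizing the inputs and requantizing the result. The hunk only shows the signatures, so the sketch below is the standard bilinear weighting those helpers are built around, in plain float arithmetic with the quantization steps left out; it is not the library's implementation.

#include <cstdio>

// Bilinear interpolation of a single channel from the 2x2 neighbourhood
// {p00 p01; p10 p11}, where (dx, dy) in [0, 1) are the fractional offsets.
float delta_bilinear(float p00, float p01, float p10, float p11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy);
    const float w01 = dx * (1.f - dy);
    const float w10 = (1.f - dx) * dy;
    const float w11 = dx * dy;
    return w00 * p00 + w01 * p01 + w10 * p10 + w11 * p11;
}

int main()
{
    // Exactly half-way between all four pixels: the result is their average (15).
    std::printf("%f\n", delta_bilinear(0.f, 10.f, 20.f, 30.f, 0.5f, 0.5f));
    return 0;
}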
diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp
index 71b971af31..8184991ab5 100644
--- a/src/core/helpers/SoftmaxHelpers.cpp
+++ b/src/core/helpers/SoftmaxHelpers.cpp
@@ -29,7 +29,7 @@ namespace softmax_helpers
{
PermutationVector get_permutation_vector_from_softmax_axis(size_t axis)
{
- switch(axis)
+ switch (axis)
{
case 1:
return PermutationVector(1U, 0U, 2U, 3U);
diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp
index 3900475355..6ca29d180d 100644
--- a/src/core/helpers/Utils.cpp
+++ b/src/core/helpers/Utils.cpp
@@ -31,9 +31,9 @@ bool has_holes(const ITensorInfo &info, size_t dimension)
const auto &strides = info.strides_in_bytes();
size_t squashed_bytes = info.element_size();
- for(size_t dim = 0; dim <= dimension; ++dim)
+ for (size_t dim = 0; dim <= dimension; ++dim)
{
- if(strides[dim] != squashed_bytes)
+ if (strides[dim] != squashed_bytes)
{
return true;
}
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 7ad960bfa2..2e7224c55b 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -45,7 +45,7 @@ inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&...fixe
// Create strides object
Strides strides(stride_x, fixed_strides...);
- for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+ for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
{
strides.set(i, shape[i - 1] * strides[i - 1]);
}
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
index a4d46db352..30a55fcbc6 100644
--- a/src/core/helpers/WindowHelpers.cpp
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -25,9 +25,10 @@
namespace arm_compute
{
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window
+calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
- if(!skip_border)
+ if (!skip_border)
{
border_size = BorderSize(0);
}
@@ -38,40 +39,47 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps,
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- anchor[0] + border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
- window.set(1, Window::Dimension(
+ window.set(1,
+ Window::Dimension(
// Skip the border above the image
anchor[1] + border_size.top,
// Skip the border below the image
- anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
+ anchor[1] + border_size.top +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) -
+ static_cast<int>(border_size.bottom)),
+ steps[1]),
steps[1]));
++n;
}
- if(anchor.num_dimensions() > 2)
+ if (anchor.num_dimensions() > 2)
{
window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -81,7 +89,7 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps,
Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size)
{
- if(!skip_border)
+ if (!skip_border)
{
border_size = BorderSize(0);
}
@@ -89,40 +97,46 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool s
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(shape.num_dimensions() > 1)
+ if (shape.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Skip the border above the image
- border_size.top,
- // Skip the border below the image
- border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
- steps[1]));
+ // Skip the border above the image
+ border_size.top,
+ // Skip the border below the image
+ border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) -
+ static_cast<int>(border_size.top) -
+ static_cast<int>(border_size.bottom)),
+ steps[1]),
+ steps[1]));
++n;
}
- if(shape.num_dimensions() > 2)
+ if (shape.num_dimensions() > 2)
{
window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[2]), steps[2]));
++n;
}
- for(; n < shape.num_dimensions(); ++n)
+ for (; n < shape.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(0, std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -138,40 +152,42 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
Window window;
window.set(0, Window::Dimension(
- // move the anchor to the start from the border
- anchor[0] - border_size.left,
- // move the anchor to include the right end border
- // Make sure the window width is a multiple of the step size
- anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
- steps[0]));
+ // move the anchor to the start from the border
+ anchor[0] - border_size.left,
+ // move the anchor to include the right end border
+ // Make sure the window width is a multiple of the step size
+ anchor[0] - border_size.left +
+ ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Include the border above the image
- anchor[1] - border_size.top,
- // Include the border below the image
- anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
- steps[1]));
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] - border_size.top +
+ ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+ steps[1]));
++n;
}
- if(anchor.num_dimensions() > 2)
+ if (anchor.num_dimensions() > 2)
{
window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -179,9 +195,12 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
return window;
}
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+ const Steps &steps,
+ bool skip_border,
+ BorderSize border_size)
{
- if(skip_border)
+ if (skip_border)
{
border_size.top = 0;
border_size.bottom = 0;
@@ -198,33 +217,35 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
Window window;
window.set(0, Window::Dimension(
- // Skip the border left of the image
- anchor[0] + border_size.left,
- // Skip the border right of the image
- // Make sure the window width is a multiple of the step size
- anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
- steps[0]));
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left +
+ ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+ static_cast<int>(border_size.right)),
+ steps[0]),
+ steps[0]));
size_t n = 1;
- if(anchor.num_dimensions() > 1)
+ if (anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
- // Skip the border above the image
- anchor[1] - border_size.top,
- // Skip the border below the image
- anchor[1] + shape[1] + border_size.bottom,
- 1));
+ // Skip the border above the image
+ anchor[1] - border_size.top,
+ // Skip the border below the image
+ anchor[1] + shape[1] + border_size.bottom, 1));
++n;
}
- for(; n < anchor.num_dimensions(); ++n)
+ for (; n < anchor.num_dimensions(); ++n)
{
window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
}
- for(; n < Coordinates::num_max_dimensions; ++n)
+ for (; n < Coordinates::num_max_dimensions; ++n)
{
window.set(n, Window::Dimension(0, 1));
}
@@ -247,9 +268,9 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
size_t squashed_bytes = src0.element_size();
// Try to squash the low dimensions together.
- for(; dim < num_dimensions; ++dim)
+ for (; dim < num_dimensions; ++dim)
{
- if(shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
+ if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
{
break;
}
@@ -257,7 +278,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
squashed_bytes *= shape0[dim];
}
- if(dim == num_dimensions)
+ if (dim == num_dimensions)
{
auto squashed_elements = squashed_bytes / src0.element_size();
@@ -266,7 +287,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
// The input tensors can be interpreted as 1D array.
win.set(0, Window::Dimension(0, squashed_elements, 1));
- for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, 1, 1));
}
@@ -274,7 +295,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
else
{
// Generates the max window.
- for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1));
}
@@ -295,21 +316,21 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
size_t squashed_bytes = src.element_size();
// Try to squash the low dimensions together.
- for(; dim < num_dimensions; ++dim)
+ for (; dim < num_dimensions; ++dim)
{
- if(strides[dim] != squashed_bytes)
+ if (strides[dim] != squashed_bytes)
{
break;
}
squashed_bytes *= shape[dim];
}
- if(dim == num_dimensions)
+ if (dim == num_dimensions)
{
const auto squashed_elements = squashed_bytes / src.element_size();
split_dimension = Window::DimX;
// The input tensor can be interpreted as 1D array.
win.set(0, Window::Dimension(0, squashed_elements, 1));
- for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, 1, 1));
}
@@ -317,7 +338,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
else
{
// Generate the max window.
- for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+ for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
{
win.set(dim, Window::Dimension(0, shape[dim], 1));
}
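In calculate_max_window, the end of window dimension 0 is the valid extent minus the left and right borders, rounded up to a multiple of the step so vectorised kernels always execute whole steps (running into the padded border rather than stopping short). A standalone sketch of that end-coordinate computation; ceil_to_multiple is re-implemented locally and the numbers are only illustrative.

#include <algorithm>
#include <cstdio>

int ceil_to_multiple(int value, int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

// End coordinate of window dimension 0, as built above.
int window_end_x(int anchor, int shape, int border_left, int border_right, int step)
{
    const int usable = std::max(0, shape - border_left - border_right);
    return anchor + border_left + ceil_to_multiple(usable, step);
}

int main()
{
    // 23 usable elements with a step of 8 are rounded up to 24, so the last
    // iteration reads into the border region rather than processing a partial step.
    std::printf("%d\n", window_end_x(/*anchor*/ 0, /*shape*/ 25, /*left*/ 1, /*right*/ 1, /*step*/ 8)); // 25
    return 0;
}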
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
index eccf7f2d18..e404c18e8a 100644
--- a/src/core/helpers/WindowHelpers.h
+++ b/src/core/helpers/WindowHelpers.h
@@ -43,21 +43,13 @@ namespace arm_compute
* influence the returned value.
*/
template <typename... Ts>
-bool update_window_and_padding(Window &win, Ts &&... patterns)
+bool update_window_and_padding(Window &win, Ts &&...patterns)
{
bool window_changed = false;
- utility::for_each([&](const IAccessWindow & w)
- {
- window_changed |= w.update_window_if_needed(win);
- },
- patterns...);
+ utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...);
- utility::for_each([&](IAccessWindow & w)
- {
- w.update_padding_if_needed(win);
- },
- patterns...);
+ utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...);
return window_changed;
}
@@ -69,18 +61,18 @@ bool update_window_and_padding(Window &win, Ts &&... patterns)
* @return Intersection of all regions.
*/
template <typename... Ts>
-ValidRegion intersect_valid_regions(const Ts &... regions)
+ValidRegion intersect_valid_regions(const Ts &...regions)
{
- auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+ auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion
{
ValidRegion region;
- for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+ for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
{
region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
}
- for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+ for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
{
region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
}
@@ -101,7 +93,10 @@ ValidRegion intersect_valid_regions(const Ts &... regions)
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting
*
@@ -112,7 +107,10 @@ Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const TensorShape &shape,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting
*
@@ -123,7 +121,10 @@ Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize())
{
return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size);
}
@@ -137,7 +138,10 @@ inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps =
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
*
@@ -148,7 +152,10 @@ Window calculate_max_window_horizontal(const ValidRegion &valid_region, const St
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window_horizontal(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ bool skip_border = false,
+ BorderSize border_size = BorderSize())
{
return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
}
@@ -161,7 +168,9 @@ inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Ste
*
* @return The maximum window the kernel can be executed on.
*/
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+Window calculate_max_enlarged_window(const ValidRegion &valid_region,
+ const Steps &steps = Steps(),
+ BorderSize border_size = BorderSize());
/** Calculate the maximum window for a given tensor shape and border setting. The window will also includes the border.
*
@@ -171,7 +180,9 @@ Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Step
*
* @return The maximum window the kernel can be executed on.
*/
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+inline Window calculate_max_enlarged_window(const ITensorInfo &info,
+ const Steps &steps = Steps(),
+ BorderSize border_size = BorderSize())
{
return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
}
@@ -208,7 +219,7 @@ std::pair<Window, size_t> calculate_squashed_or_max_window(const ITensorInfo &sr
* @return A pair of the shape and window
*/
template <typename... Shapes>
-std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &... shapes)
+std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &...shapes)
{
const TensorShape out_shape = TensorShape::broadcast_shape(shapes...);
return std::make_pair(out_shape, calculate_max_window(out_shape));
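These overloads are what kernels call from configure() to derive their execution window. A minimal usage sketch for the TensorShape overload, assuming a translation unit inside the library tree where this internal header is visible; the shape and step values are placeholders.

#include "arm_compute/core/Steps.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Window.h"

#include "src/core/helpers/WindowHelpers.h"

using namespace arm_compute;

Window make_window_for_2d_kernel()
{
    // Process 16 elements per iteration along x, one row at a time along y.
    const TensorShape shape(128U, 24U);
    return calculate_max_window(shape, Steps(16U));
}

int main()
{
    const Window win = make_window_for_2d_kernel();
    return (win.x().step() == 16) ? 0 : 1;
}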
diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp
index 4854b8eb0b..017170a0c5 100644
--- a/src/core/utils/ActivationFunctionUtils.cpp
+++ b/src/core/utils/ActivationFunctionUtils.cpp
@@ -28,26 +28,24 @@
namespace arm_compute
{
-const std::string &string_from_activation_func(const ActivationFunction& act)
+const std::string &string_from_activation_func(const ActivationFunction &act)
{
- static std::map<ActivationFunction, const std::string> act_map =
- {
- { ActivationFunction::ABS, "ABS" },
- { ActivationFunction::LINEAR, "LINEAR" },
- { ActivationFunction::LOGISTIC, "LOGISTIC" },
- { ActivationFunction::RELU, "RELU" },
- { ActivationFunction::BOUNDED_RELU, "BRELU" },
- { ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" },
- { ActivationFunction::LEAKY_RELU, "LRELU" },
- { ActivationFunction::SOFT_RELU, "SRELU" },
- { ActivationFunction::ELU, "ELU" },
- { ActivationFunction::SQRT, "SQRT" },
- { ActivationFunction::SQUARE, "SQUARE" },
- { ActivationFunction::TANH, "TANH" },
- { ActivationFunction::IDENTITY, "IDENTITY" },
- { ActivationFunction::HARD_SWISH, "HARD_SWISH" },
- { ActivationFunction::SWISH, "SWISH" },
- { ActivationFunction::GELU, "GELU" }
+ static std::map<ActivationFunction, const std::string> act_map = {{ActivationFunction::ABS, "ABS"},
+ {ActivationFunction::LINEAR, "LINEAR"},
+ {ActivationFunction::LOGISTIC, "LOGISTIC"},
+ {ActivationFunction::RELU, "RELU"},
+ {ActivationFunction::BOUNDED_RELU, "BRELU"},
+ {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"},
+ {ActivationFunction::LEAKY_RELU, "LRELU"},
+ {ActivationFunction::SOFT_RELU, "SRELU"},
+ {ActivationFunction::ELU, "ELU"},
+ {ActivationFunction::SQRT, "SQRT"},
+ {ActivationFunction::SQUARE, "SQUARE"},
+ {ActivationFunction::TANH, "TANH"},
+ {ActivationFunction::IDENTITY, "IDENTITY"},
+ {ActivationFunction::HARD_SWISH, "HARD_SWISH"},
+ {ActivationFunction::SWISH, "SWISH"},
+ {ActivationFunction::GELU, "GELU"}
};
diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp
index 6d483adc7f..d97ea42091 100644
--- a/src/core/utils/AssemblyUtils.cpp
+++ b/src/core/utils/AssemblyUtils.cpp
@@ -34,12 +34,12 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
arm_gemm::Activation gemm_act;
// Early exit in case lower bound is other than 0, as it's not yet supported
- if(act.b() != 0.f)
+ if (act.b() != 0.f)
{
return gemm_act;
}
- switch(act.activation())
+ switch (act.activation())
{
case ActivationLayerInfo::ActivationFunction::RELU:
gemm_act.type = arm_gemm::Activation::Type::ReLU;
@@ -63,17 +63,15 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
{
- return arm_conv::PaddingValues{ pad_stride_info.pad_left(),
- pad_stride_info.pad_top(),
- pad_stride_info.pad_right(),
- pad_stride_info.pad_bottom() };
+ return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(),
+ pad_stride_info.pad_bottom()};
}
arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format)
{
arm_gemm::WeightFormat gemm_weight_fromat;
- switch(weight_format)
+ switch (weight_format)
{
case arm_compute::WeightFormat::UNSPECIFIED:
gemm_weight_fromat = arm_gemm::WeightFormat::UNSPECIFIED;
@@ -193,7 +191,7 @@ arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::Weigh
{
arm_compute::WeightFormat acl_weight_fromat;
- switch(weight_format)
+ switch (weight_format)
{
case arm_gemm::WeightFormat::UNSPECIFIED:
acl_weight_fromat = arm_compute::WeightFormat::UNSPECIFIED;
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
index 60bad3b618..7d0d37c4ef 100644
--- a/src/core/utils/AssemblyUtils.h
+++ b/src/core/utils/AssemblyUtils.h
@@ -25,6 +25,7 @@
#define UTILS_CORE_ASSEMBLY_UTILS_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/kernels/assembly/common.hpp"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
@@ -65,6 +66,6 @@ arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFo
* @return Compute Library WeightFormat
*/
arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format);
-} // namespace assembly
+} // namespace assembly_utils
} // namespace arm_compute
#endif /* UTILS_CORE_ASSEMBLY_UTILS_H */
diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp
index 4919b79a42..234bed71cb 100644
--- a/src/core/utils/DataLayoutUtils.cpp
+++ b/src/core/utils/DataLayoutUtils.cpp
@@ -29,11 +29,10 @@ namespace arm_compute
const std::string &string_from_data_layout(DataLayout dl)
{
- static std::map<DataLayout, const std::string> dl_map =
- {
- { DataLayout::UNKNOWN, "UNKNOWN" },
- { DataLayout::NCHW, "NCHW" },
- { DataLayout::NHWC, "NHWC" },
+ static std::map<DataLayout, const std::string> dl_map = {
+ {DataLayout::UNKNOWN, "UNKNOWN"},
+ {DataLayout::NCHW, "NCHW"},
+ {DataLayout::NHWC, "NHWC"},
};
return dl_map[dl];
diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp
index 07999354d9..1394339987 100644
--- a/src/core/utils/DataTypeUtils.cpp
+++ b/src/core/utils/DataTypeUtils.cpp
@@ -30,27 +30,26 @@ namespace arm_compute
{
const std::string &string_from_data_type(DataType dt)
{
- static std::map<DataType, const std::string> dt_map =
- {
- { DataType::UNKNOWN, "UNKNOWN" },
- { DataType::S8, "S8" },
- { DataType::U8, "U8" },
- { DataType::S16, "S16" },
- { DataType::U16, "U16" },
- { DataType::S32, "S32" },
- { DataType::U32, "U32" },
- { DataType::S64, "S64" },
- { DataType::U64, "U64" },
- { DataType::F16, "F16" },
- { DataType::F32, "F32" },
- { DataType::F64, "F64" },
- { DataType::SIZET, "SIZET" },
- { DataType::QSYMM8, "QSYMM8" },
- { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
- { DataType::QASYMM8, "QASYMM8" },
- { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
- { DataType::QSYMM16, "QSYMM16" },
- { DataType::QASYMM16, "QASYMM16" },
+ static std::map<DataType, const std::string> dt_map = {
+ {DataType::UNKNOWN, "UNKNOWN"},
+ {DataType::S8, "S8"},
+ {DataType::U8, "U8"},
+ {DataType::S16, "S16"},
+ {DataType::U16, "U16"},
+ {DataType::S32, "S32"},
+ {DataType::U32, "U32"},
+ {DataType::S64, "S64"},
+ {DataType::U64, "U64"},
+ {DataType::F16, "F16"},
+ {DataType::F32, "F32"},
+ {DataType::F64, "F64"},
+ {DataType::SIZET, "SIZET"},
+ {DataType::QSYMM8, "QSYMM8"},
+ {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"},
+ {DataType::QASYMM8, "QASYMM8"},
+ {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"},
+ {DataType::QSYMM16, "QSYMM16"},
+ {DataType::QASYMM16, "QASYMM16"},
};
return dt_map[dt];
@@ -58,12 +57,11 @@ const std::string &string_from_data_type(DataType dt)
DataType data_type_from_name(const std::string &name)
{
- static const std::map<std::string, DataType> data_types =
- {
- { "f16", DataType::F16 },
- { "f32", DataType::F32 },
- { "qasymm8", DataType::QASYMM8 },
- { "qasymm8_signed", DataType::QASYMM8_SIGNED },
+ static const std::map<std::string, DataType> data_types = {
+ {"f16", DataType::F16},
+ {"f32", DataType::F32},
+ {"qasymm8", DataType::QASYMM8},
+ {"qasymm8_signed", DataType::QASYMM8_SIGNED},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -74,7 +72,7 @@ DataType data_type_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str());
}
diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp
index 05b649ded2..46f8455315 100644
--- a/src/core/utils/FormatUtils.cpp
+++ b/src/core/utils/FormatUtils.cpp
@@ -30,26 +30,16 @@ namespace arm_compute
{
const std::string &string_from_format(Format format)
{
- static std::map<Format, const std::string> formats_map =
- {
- { Format::UNKNOWN, "UNKNOWN" },
- { Format::U8, "U8" },
- { Format::S16, "S16" },
- { Format::U16, "U16" },
- { Format::S32, "S32" },
- { Format::U32, "U32" },
- { Format::F16, "F16" },
- { Format::F32, "F32" },
- { Format::UV88, "UV88" },
- { Format::RGB888, "RGB888" },
- { Format::RGBA8888, "RGBA8888" },
- { Format::YUV444, "YUV444" },
- { Format::YUYV422, "YUYV422" },
- { Format::NV12, "NV12" },
- { Format::NV21, "NV21" },
- { Format::IYUV, "IYUV" },
- { Format::UYVY422, "UYVY422" }
- };
+ static std::map<Format, const std::string> formats_map = {
+ {Format::UNKNOWN, "UNKNOWN"}, {Format::U8, "U8"},
+ {Format::S16, "S16"}, {Format::U16, "U16"},
+ {Format::S32, "S32"}, {Format::U32, "U32"},
+ {Format::F16, "F16"}, {Format::F32, "F32"},
+ {Format::UV88, "UV88"}, {Format::RGB888, "RGB888"},
+ {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"},
+ {Format::YUYV422, "YUYV422"}, {Format::NV12, "NV12"},
+ {Format::NV21, "NV21"}, {Format::IYUV, "IYUV"},
+ {Format::UYVY422, "UYVY422"}};
return formats_map[format];
}
diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp
index 2d6cabe85e..276e760544 100644
--- a/src/core/utils/InterpolationPolicyUtils.cpp
+++ b/src/core/utils/InterpolationPolicyUtils.cpp
@@ -29,11 +29,10 @@ namespace arm_compute
const std::string &string_from_interpolation_policy(InterpolationPolicy policy)
{
- static std::map<InterpolationPolicy, const std::string> interpolation_policy_map =
- {
- { InterpolationPolicy::AREA, "AREA" },
- { InterpolationPolicy::BILINEAR, "BILINEAR" },
- { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" },
+ static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = {
+ {InterpolationPolicy::AREA, "AREA"},
+ {InterpolationPolicy::BILINEAR, "BILINEAR"},
+ {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"},
};
return interpolation_policy_map[policy];
diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp
index ee57a8e7a7..a92da39b67 100644
--- a/src/core/utils/ScaleUtils.cpp
+++ b/src/core/utils/ScaleUtils.cpp
@@ -23,11 +23,12 @@
*/
#include "src/core/utils/ScaleUtils.h"
-#include "src/common/cpuinfo/CpuIsaInfo.h"
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/TensorInfo.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+
float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners)
{
const size_t offset = (align_corners && output_size > 1) ? 1 : 0;
@@ -40,13 +41,15 @@ float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t
return static_cast<float>(in) / static_cast<float>(out);
}
-bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, DataType data_type,
- InterpolationPolicy policy, BorderMode border_mode)
+bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout,
+ DataType data_type,
+ InterpolationPolicy policy,
+ BorderMode border_mode)
{
// Do not calculate precomputed weights and indices if kernel code doesn't use them
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
case DataType::F16:
@@ -62,4 +65,4 @@ bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout
}
return true;
-} \ No newline at end of file
+}
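calculate_resize_ratio maps output coordinates back onto the input grid; with align_corners the first and last samples of both grids are pinned together, which is why one sample is subtracted from each extent. The lines defining in and out fall outside the hunk above, so the sketch below uses the standard align-corners formula rather than a verbatim copy of the library code.

#include <cstddef>
#include <cstdio>

// Input/output sample counts -> ratio used to map destination coords to source coords.
float resize_ratio(std::size_t input_size, std::size_t output_size, bool align_corners)
{
    const std::size_t offset = (align_corners && output_size > 1) ? 1 : 0;
    const std::size_t in     = input_size - offset;
    const std::size_t out    = output_size - offset;
    return static_cast<float>(in) / static_cast<float>(out);
}

int main()
{
    std::printf("plain:         %f\n", resize_ratio(4, 8, false)); // 0.5
    std::printf("align_corners: %f\n", resize_ratio(4, 8, true));  // 3/7
    return 0;
}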
diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h
index 1484824a7f..d8dddc8c70 100644
--- a/src/core/utils/ScaleUtils.h
+++ b/src/core/utils/ScaleUtils.h
@@ -60,8 +60,11 @@ inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_pol
*
* @return True if precomputation is required
*/
-bool is_precomputation_required(DataLayout data_layout, DataType data_type, InterpolationPolicy policy, BorderMode border_mode);
+bool is_precomputation_required(DataLayout data_layout,
+ DataType data_type,
+ InterpolationPolicy policy,
+ BorderMode border_mode);
} // namespace scale_utils
} // namespace arm_compute
-#endif /* UTILS_CORE_SCALEUTILS_H */ \ No newline at end of file
+#endif /* UTILS_CORE_SCALEUTILS_H */
diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp
index 6d05c9b64e..bcab0ce10c 100644
--- a/src/core/utils/StringUtils.cpp
+++ b/src/core/utils/StringUtils.cpp
@@ -55,7 +55,7 @@ std::string float_to_string_with_full_precision(float val)
ss.precision(std::numeric_limits<float>::max_digits10);
ss << val;
- if(val != static_cast<int>(val))
+ if (val != static_cast<int>(val))
{
ss << "f";
}
@@ -65,17 +65,11 @@ std::string float_to_string_with_full_precision(float val)
std::string join(const std::vector<std::string> strings, const std::string &sep)
{
- if(strings.empty())
+ if (strings.empty())
{
return "";
}
- return std::accumulate(
- std::next(strings.begin()),
- strings.end(),
- strings.at(0),
- [&sep](const std::string & a, const std::string & b)
- {
- return a + sep + b;
- });
-}
+ return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0),
+ [&sep](const std::string &a, const std::string &b) { return a + sep + b; });
}
+} // namespace arm_compute
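join above folds the separator between the remaining elements with std::accumulate, seeding the fold with the first string so no leading separator appears. A standalone copy of the same idiom with a small driver:

#include <cstdio>
#include <iterator>
#include <numeric>
#include <string>
#include <vector>

std::string join(const std::vector<std::string> &strings, const std::string &sep)
{
    if (strings.empty())
    {
        return "";
    }
    return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0),
                           [&sep](const std::string &a, const std::string &b) { return a + sep + b; });
}

int main()
{
    std::printf("%s\n", join({"NEON", "SVE", "SVE2"}, ", ").c_str());
    return 0;
}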
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 64633c643d..edc8d0eacc 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -37,7 +37,7 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
unsigned int res = N;
// Early exit if no supported factors are provided
- if(supported_factors.empty())
+ if (supported_factors.empty())
{
return stages;
}
@@ -46,10 +46,10 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
auto rfactor_it = supported_factors.rbegin();
// Decomposition step
- while(res != 0)
+ while (res != 0)
{
const unsigned int factor = *rfactor_it;
- if(0 == (res % factor) && res >= factor)
+ if (0 == (res % factor) && res >= factor)
{
stages.push_back(factor);
res /= factor;
@@ -57,9 +57,9 @@ std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsign
else
{
++rfactor_it;
- if(rfactor_it == supported_factors.rend())
+ if (rfactor_it == supported_factors.rend())
{
- if(res > 1)
+ if (res > 1)
{
// Couldn't decompose with given factors
stages.clear();
@@ -81,8 +81,9 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto
std::vector<unsigned int> idx_digit_reverse;
// Early exit in case N and fft stages do not match
- const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
- if(stages_prod != N)
+ const float stages_prod =
+ std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
+ if (stages_prod != N)
{
return idx_digit_reverse;
}
@@ -94,13 +95,13 @@ std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vecto
unsigned int n_stages = fft_stages.size();
// Scan elements
- for(unsigned int n = 0; n < N; ++n)
+ for (unsigned int n = 0; n < N; ++n)
{
unsigned int k = n;
unsigned int Nx = fft_stages[0];
// Scan stages
- for(unsigned int s = 1; s < n_stages; ++s)
+ for (unsigned int s = 1; s < n_stages; ++s)
{
// radix of stage i-th
unsigned int Ny = fft_stages[s];
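decompose_stages greedily divides the FFT length by the largest supported radix first and abandons the decomposition (returning an empty vector) when the remainder cannot be expressed with the given factors. A standalone sketch modelled on that loop; it is simplified (it stops at a remainder of 1) and is not the library's code.

#include <cstdio>
#include <set>
#include <vector>

std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &supported_factors)
{
    std::vector<unsigned int> stages;
    if (supported_factors.empty())
    {
        return stages;
    }

    unsigned int res        = N;
    auto         rfactor_it = supported_factors.rbegin(); // largest factor first

    while (res != 1)
    {
        const unsigned int factor = *rfactor_it;
        if (res % factor == 0)
        {
            stages.push_back(factor);
            res /= factor;
        }
        else if (++rfactor_it == supported_factors.rend())
        {
            return {}; // cannot be decomposed with the given radixes
        }
    }
    return stages;
}

int main()
{
    // 120 decomposes as 8 * 5 * 3 with radixes {2, 3, 4, 5, 7, 8}.
    for (unsigned int s : decompose(120, {2, 3, 4, 5, 7, 8}))
    {
        std::printf("%u ", s);
    }
    std::printf("\n");
    return 0;
}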
diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h
index 99e1ea54ee..7f7fbd13bf 100644
--- a/src/core/utils/helpers/float_ops.h
+++ b/src/core/utils/helpers/float_ops.h
@@ -39,8 +39,7 @@ union RawFloat
*
* @param[in] val Floating-point value
*/
- explicit RawFloat(float val)
- : f32(val)
+ explicit RawFloat(float val) : f32(val)
{
}
/** Extract sign of floating point number
diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h
index 9279532e2a..fd4745a453 100644
--- a/src/core/utils/helpers/tensor_info.h
+++ b/src/core/utils/helpers/tensor_info.h
@@ -41,15 +41,17 @@ namespace tensor_info
* @return True if tensors have mismatching quantization info else false.
*/
template <typename... Ts>
-inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->quantization_info() != first_quantization_info;
- });
+ const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+ {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+ return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->quantization_info() != first_quantization_info; });
}
} // namespace tensor_info
} // namespace helpers
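tensors_have_different_quantization_info packs its variadic arguments into a std::array and runs std::any_of over it. A standalone sketch of that pack-into-array-and-test idiom on plain values; the function name is illustrative.

#include <algorithm>
#include <array>
#include <cstdio>

// Fold a variadic pack of values into an array and test a predicate over it,
// the same idiom used above to compare quantization infos.
template <typename T, typename... Ts>
bool any_differs(const T &reference, const Ts &...values)
{
    const std::array<T, sizeof...(Ts)> pack{{values...}};
    return std::any_of(pack.begin(), pack.end(), [&](const T &v) { return v != reference; });
}

int main()
{
    std::printf("%d\n", any_differs(1.5f, 1.5f, 1.5f));       // 0
    std::printf("%d\n", any_differs(1.5f, 1.5f, 2.0f, 1.5f)); // 1
    return 0;
}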
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index f2216995a9..19d0badd74 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -36,10 +36,11 @@ int calculate_stride_on_index(int index, Coordinates strides)
return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index];
}
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
+int calculate_start_on_index(
+ TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
{
// Early exit
- if(index >= static_cast<int>(starts.num_dimensions()))
+ if (index >= static_cast<int>(starts.num_dimensions()))
{
return 0;
}
@@ -51,14 +52,14 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
int start = starts[index];
// Reset in case of begin mask present
- if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
+ if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
{
start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
}
// Account negative start points
const int dim_size = input_shape[index];
- if(start < 0)
+ if (start < 0)
{
start += dim_size;
}
@@ -69,12 +70,16 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
return start;
}
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index,
- Coordinates ends, Coordinates strides,
- int32_t end_mask, int32_t shrink_axis_mask)
+int calculate_end_on_index(TensorShape input_shape,
+ int index,
+ int start_on_index,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Early exit
- if(index >= static_cast<int>(ends.num_dimensions()))
+ if (index >= static_cast<int>(ends.num_dimensions()))
{
return input_shape[index];
}
@@ -86,9 +91,9 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
int stop = ends[index];
// Shrink dimension
- if(shrink_axis)
+ if (shrink_axis)
{
- if(start_on_index == std::numeric_limits<int>::max())
+ if (start_on_index == std::numeric_limits<int>::max())
{
stop = start_on_index;
}
@@ -99,14 +104,14 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
}
// Reset in case of begin mask present
- if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
+ if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
{
stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
}
// Account negative end points
const int dim_size = input_shape[index];
- if(stop < 0)
+ if (stop < 0)
{
stop += dim_size;
}
@@ -118,14 +123,18 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
}
std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
- Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
Coordinates starts_abs{};
Coordinates ends_abs{};
Coordinates final_strides{};
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
starts_abs.set(i, start_i);
@@ -136,13 +145,19 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords
return std::make_tuple(starts_abs, ends_abs, final_strides);
}
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked)
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask,
+ bool return_unshrinked)
{
unsigned int index = 0;
TensorShape output_shape;
- for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
{
const int stride = calculate_stride_on_index(index, strides);
const int start = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
@@ -150,11 +165,11 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina
const int range = end - start;
const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
- if(return_unshrinked || !is_shrink)
+ if (return_unshrinked || !is_shrink)
{
- if((range == 0) || // Zero range
- (range < 0 && stride >= 0) || // Negative range with positive stride
- (range > 0 && stride <= 0)) // Positive range with negative stride
+ if ((range == 0) || // Zero range
+ (range < 0 && stride >= 0) || // Negative range with positive stride
+ (range > 0 && stride <= 0)) // Positive range with negative stride
{
output_shape.set(index, 0);
return output_shape;
@@ -173,9 +188,9 @@ int32_t construct_slice_end_mask(Coordinates ends)
{
// Create end mask
int32_t end_mask = 0;
- for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < ends.num_dimensions(); ++i)
{
- if(ends[i] < 0)
+ if (ends[i] < 0)
{
end_mask |= 1 << i;
}
diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp
index 95fc2e3fa2..d106493238 100644
--- a/src/core/utils/io/FileHandler.cpp
+++ b/src/core/utils/io/FileHandler.cpp
@@ -21,16 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <string>
-
#include "arm_compute/core/utils/io/FileHandler.h"
#include "arm_compute/core/Error.h"
+#include <string>
+
using namespace arm_compute::io;
-FileHandler::FileHandler()
- : _filestream(), _filename(" "), _mode()
+FileHandler::FileHandler() : _filestream(), _filename(" "), _mode()
{
}
diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp
index 55e78f9630..7b4eead38d 100644
--- a/src/core/utils/logging/FilePrinter.cpp
+++ b/src/core/utils/logging/FilePrinter.cpp
@@ -25,8 +25,7 @@
using namespace arm_compute::logging;
-FilePrinter::FilePrinter(const std::string &filename)
- : _handler()
+FilePrinter::FilePrinter(const std::string &filename) : _handler()
{
_handler.open(filename, std::fstream::out | std::fstream::trunc);
}
@@ -34,4 +33,4 @@ FilePrinter::FilePrinter(const std::string &filename)
void FilePrinter::print_internal(const std::string &msg)
{
_handler.stream() << msg << std::endl;
-}
\ No newline at end of file
+}
diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp
index c3df7f6207..14ad910562 100644
--- a/src/core/utils/logging/Helpers.cpp
+++ b/src/core/utils/logging/Helpers.cpp
@@ -30,13 +30,12 @@ using namespace arm_compute::logging;
const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level)
{
- static std::map<LogLevel, const std::string> log_level_map =
- {
- { LogLevel::VERBOSE, "VERBOSE" },
- { LogLevel::INFO, "INFO" },
- { LogLevel::WARN, "WARN" },
- { LogLevel::OFF, "OFF" },
+ static std::map<LogLevel, const std::string> log_level_map = {
+ {LogLevel::VERBOSE, "VERBOSE"},
+ {LogLevel::INFO, "INFO"},
+ {LogLevel::WARN, "WARN"},
+ {LogLevel::OFF, "OFF"},
};
return log_level_map[log_level];
-}
\ No newline at end of file
+}
diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp
index 70b5868da8..d6681f8179 100644
--- a/src/core/utils/logging/Logger.cpp
+++ b/src/core/utils/logging/Logger.cpp
@@ -30,10 +30,7 @@
using namespace arm_compute::logging;
Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer)
- : _name(std::move(name)), _log_level(log_level), _printers(
-{
- std::move(printer)
-}), _decorators()
+ : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators()
{
// Check printer
ARM_COMPUTE_ERROR_ON(printer == nullptr);
@@ -46,7 +43,7 @@ Logger::Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr
: _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators()
{
// Check printers
- for(const auto &p : _printers)
+ for (const auto &p : _printers)
{
ARM_COMPUTE_UNUSED(p);
ARM_COMPUTE_ERROR_ON(p == nullptr);
@@ -62,13 +59,13 @@ Logger::Logger(std::string name,
: _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators(std::move(decorators))
{
// Check printers
- for(const auto &p : _printers)
+ for (const auto &p : _printers)
{
ARM_COMPUTE_UNUSED(p);
ARM_COMPUTE_ERROR_ON(p == nullptr);
}
// Check decorators
- for(const auto &d : _decorators)
+ for (const auto &d : _decorators)
{
ARM_COMPUTE_UNUSED(d);
ARM_COMPUTE_ERROR_ON(d == nullptr);
@@ -79,7 +76,7 @@ void Logger::log(LogLevel log_level, const std::string &msg)
{
// Return if message shouldn't be logged
// i.e. if log level does not match the logger's
- if(!is_loggable(log_level))
+ if (!is_loggable(log_level))
{
return;
}
@@ -129,7 +126,7 @@ bool Logger::is_loggable(LogLevel log_level)
void Logger::decorate_log_msg(LogMsg &msg)
{
- for(const auto &d : _decorators)
+ for (const auto &d : _decorators)
{
d->decorate(msg);
}
@@ -148,7 +145,7 @@ std::string Logger::create_log_msg(const std::string &str, LogLevel log_level)
void Logger::print_all(const std::string &msg)
{
- for(auto &p : _printers)
+ for (auto &p : _printers)
{
p->print(msg);
}
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index c281d8863c..17015d9ae9 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/core/utils/logging/LoggerRegistry.h"
#include "arm_compute/core/Error.h"
+
#include "support/Mutex.h"
using namespace arm_compute::logging;
/** Reserved logger used by the library */
-std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" };
+std::set<std::string> LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"};
-LoggerRegistry::LoggerRegistry()
- : _mtx(), _loggers()
+LoggerRegistry::LoggerRegistry() : _mtx(), _loggers()
{
}
@@ -42,10 +42,12 @@ LoggerRegistry &LoggerRegistry::get()
return _instance;
}
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
+void LoggerRegistry::create_logger(const std::string &name,
+ LogLevel log_level,
+ const std::vector<std::shared_ptr<Printer>> &printers)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
+ if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
{
_loggers[name] = std::make_shared<Logger>(name, log_level, printers);
}
@@ -54,7 +56,7 @@ void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level,
void LoggerRegistry::remove_logger(const std::string &name)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- if(_loggers.find(name) != _loggers.end())
+ if (_loggers.find(name) != _loggers.end())
{
_loggers.erase(name);
}
@@ -69,9 +71,9 @@ std::shared_ptr<Logger> LoggerRegistry::logger(const std::string &name)
void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- for(const auto &r : _reserved_loggers)
+ for (const auto &r : _reserved_loggers)
{
- if(_loggers.find(r) == _loggers.end())
+ if (_loggers.find(r) == _loggers.end())
{
_loggers[r] = std::make_shared<Logger>(r, log_level, printers);
}
diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp
index adae8a2bf0..a467cb3320 100644
--- a/src/core/utils/misc/MMappedFile.cpp
+++ b/src/core/utils/misc/MMappedFile.cpp
@@ -27,12 +27,11 @@
#include <cstdio>
#include <cstring>
-#include <tuple>
-
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
+#include <tuple>
#include <unistd.h>
namespace arm_compute
@@ -53,7 +52,7 @@ std::pair<size_t, bool> get_file_size(const std::string &filename)
{
struct stat st; // NOLINT
memset(&st, 0, sizeof(struct stat));
- if(stat(filename.c_str(), &st) == 0)
+ if (stat(filename.c_str(), &st) == 0)
{
return std::make_pair(st.st_size, true);
}
@@ -73,8 +72,7 @@ size_t get_page_size()
}
} // namespace
-MMappedFile::MMappedFile()
- : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
+MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
{
}
@@ -92,14 +90,14 @@ MMappedFile::~MMappedFile()
bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
{
// Check if file is mapped
- if(is_mapped())
+ if (is_mapped())
{
return false;
}
// Open file
_fp = fopen(filename.c_str(), "a+be");
- if(_fp == nullptr)
+ if (_fp == nullptr)
{
return false;
}
@@ -107,26 +105,26 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
// Extract file descriptor
int fd = fileno(_fp);
bool status = fd >= 0;
- if(status)
+ if (status)
{
// Get file size
std::tie(_file_size, status) = get_file_size(_filename);
- if(status)
+ if (status)
{
// Map all file from offset if map size is 0
_map_size = (size == 0) ? _file_size : size;
_map_offset = offset;
// Check offset mapping
- if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
+ if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
{
status = false;
}
else
{
// Truncate to file size
- if(_map_offset + _map_size > _file_size)
+ if (_map_offset + _map_size > _file_size)
{
_map_size = _file_size - _map_offset;
}
@@ -137,7 +135,7 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
}
}
- if(!status)
+ if (!status)
{
fclose(_fp);
}
@@ -148,14 +146,14 @@ bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
void MMappedFile::release()
{
// Unmap file
- if(_data != nullptr)
+ if (_data != nullptr)
{
::munmap(_data, _file_size);
_data = nullptr;
}
// Close file
- if(_fp != nullptr)
+ if (_fp != nullptr)
{
fclose(_fp);
_fp = nullptr;
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 086d63b968..f66d3e7064 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"
@@ -40,7 +42,7 @@ constexpr float epsilon = 0.00001f;
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon)
{
- if(multiplier >= 1.f)
+ if (multiplier >= 1.f)
{
Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift);
*shift *= -1;
@@ -69,13 +71,13 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier,
*right_shift = -1 * shift_exp;
auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
- if(q_fixed == fixed_point_one_Q0)
+ if (q_fixed == fixed_point_one_Q0)
{
q_fixed /= 2;
--*right_shift;
}
- if(ignore_epsilon && *right_shift > 31)
+ if (ignore_epsilon && *right_shift > 31)
{
*right_shift = 0;
q_fixed = 0;
@@ -88,9 +90,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier,
return Status{};
}
-Status calculate_quantized_multiplier_greater_than_one(float multiplier,
- int32_t *quantized_multiplier,
- int32_t *left_shift)
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
@@ -101,7 +102,7 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
*left_shift = shift_exp;
auto q_fixed = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
- if(q_fixed == fixed_point_one_Q0)
+ if (q_fixed == fixed_point_one_Q0)
{
q_fixed /= 2;
++*left_shift;
@@ -113,9 +114,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
-arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
- const QuantizationInfo &wq_info,
- const QuantizationInfo &oq_info,
+arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
GEMMLowpOutputStageInfo &stage_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
@@ -133,7 +134,7 @@ arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_i
const float i_scale = iq_info.scale().at(0);
const float o_scale = oq_info.scale().at(0);
- for(unsigned int i = 0; i < size; ++i)
+ for (unsigned int i = 0; i < size; ++i)
{
const float multiplier = i_scale * w_scales[i] / o_scale;
int32_t quant_multiplier = 0;
@@ -154,7 +155,7 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
{
int min_quant_val = 0;
int max_quant_val = 0;
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
min_quant_val = std::numeric_limits<uint8_t>::min();
@@ -179,7 +180,9 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
return std::make_pair(min_quant_val, max_quant_val);
}
-std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info,
+ const ActivationLayerInfo &act_info,
+ DataType data_type)
{
ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED);
@@ -190,20 +193,23 @@ std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const Quant
const UniformQuantizationInfo q_unif = q_info.uniform();
- if(act_info.enabled())
+ if (act_info.enabled())
{
- switch(act_info.activation())
+ switch (act_info.activation())
{
case ActivationLayerInfo::ActivationFunction::RELU:
type_min = q_unif.offset;
break;
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
type_min = q_unif.offset;
- type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info);
+ type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+ : quantize_qasymm8_signed(act_info.a(), q_info);
break;
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) : quantize_qasymm8_signed(act_info.b(), q_info);
- type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info);
+ type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info)
+ : quantize_qasymm8_signed(act_info.b(), q_info);
+ type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+ : quantize_qasymm8_signed(act_info.a(), q_info);
break;
default:
ARM_COMPUTE_ERROR("Activation function not supported.");
@@ -226,7 +232,7 @@ void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
const unsigned int num_filters = wq_info.scale().size();
- for(unsigned int i = 0; i < num_filters; ++i)
+ for (unsigned int i = 0; i < num_filters; ++i)
{
int32_t output_multiplier = 0;
int32_t output_shift = 0;
@@ -267,11 +273,11 @@ int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t sh
int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
{
- if(exponent == 0)
+ if (exponent == 0)
{
return v;
}
- else if(exponent < 0)
+ else if (exponent < 0)
{
return rounding_divide_by_pow2(v, -exponent);
}
@@ -291,11 +297,14 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
}
}
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
+void get_invsqrt_quantized_multiplier_exp(int32_t input,
+ int32_t reverse_shift,
+ int32_t &output_inv_sqrt,
+ int32_t &output_shift)
{
ARM_COMPUTE_ERROR_ON(input < 0);
- if(input <= 1)
+ if (input <= 1)
{
// dealing the inputs (0 and 1) separately to avoid overflow
output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
@@ -305,7 +314,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// prepare input for fixed point operation and compute shift value
output_shift = 11;
- while(input >= (1 << 29))
+ while (input >= (1 << 29))
{
input /= 4;
++output_shift;
@@ -334,9 +343,7 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// multiplication of two fixed point numbers, defined for readability
auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType
- {
- return saturating_rounding_doubling_highmul(a, b);
- };
+ { return saturating_rounding_doubling_highmul(a, b); };
// rescaling of fixed point to have dst_bit integer bits, defined for readability
auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
@@ -347,17 +354,18 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// 5 iterations of Newton-Raphson method for inverse square root - 1.5 * x_n = input/2 * (x_n)^3
constexpr int32_t num_iteration = 5;
- for(int32_t i = 0; i < num_iteration; ++i)
+ for (int32_t i = 0; i < num_iteration; ++i)
{
const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
- x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
+ x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3),
+ 6, fixedpoint_position);
}
// fixed point representation of sqrt(1/2)
const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
x = fixed_point_mul(fixedpoint_half_sqrt_2, x);
output_inv_sqrt = x;
- if(output_shift < 0)
+ if (output_shift < 0)
{
output_inv_sqrt <<= -output_shift;
output_shift = 0;
@@ -365,5 +373,5 @@ void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift,
// convert right shift to left shift
output_shift *= reverse_shift;
}
-} // quantization
-} // arm_compute
+} // namespace quantization
+} // namespace arm_compute
diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h
index f9701095cb..5dc607ce58 100644
--- a/src/core/utils/quantization/AsymmHelpers.h
+++ b/src/core/utils/quantization/AsymmHelpers.h
@@ -29,7 +29,8 @@
namespace arm_compute
{
-namespace quantization {
+namespace quantization
+{
/** Get minimum and maximum output of the activation function after quantization.
*
@@ -41,7 +42,9 @@ namespace quantization {
*
* @return The minimum and maximum output of the activation function after quantization.
*/
-std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type);
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info,
+ const ActivationLayerInfo &act_info,
+ DataType data_type);
} // namespace quantization
} // namespace arm_compute
diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp
index 7c14891ef8..b745af8229 100644
--- a/src/cpu/CpuContext.cpp
+++ b/src/cpu/CpuContext.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/CpuContext.h"
#include "arm_compute/core/CPP/CPPTypes.h"
+
#include "src/cpu/CpuQueue.h"
#include "src/cpu/CpuTensor.h"
@@ -32,7 +33,7 @@
#include <malloc.h>
#if defined(_WIN64)
-#define posix_memalign _aligned_realloc
+#define posix_memalign _aligned_realloc
#define posix_memalign_free _aligned_free
#endif // defined(_WIN64)
#endif // !defined(__APPLE__) && !defined(__OpenBSD__)
@@ -66,7 +67,7 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment)
size_t real_size = (rem) ? (size + alignment - rem) : size;
ptr = memalign(alignment, real_size);
#else /* defined(BARE_METAL) */
- if(posix_memalign(&ptr, alignment, size) != 0)
+ if (posix_memalign(&ptr, alignment, size) != 0)
{
// posix_memalign returns non-zero on failures, the return values will be
// - EINVAL: wrong alignment
@@ -81,17 +82,13 @@ void default_aligned_free(void *user_data, void *ptr)
ARM_COMPUTE_UNUSED(user_data);
free(ptr);
}
-static AclAllocator default_allocator = { &default_allocate,
- &default_free,
- &default_aligned_allocate,
- &default_aligned_free,
- nullptr
- };
+static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate,
+ &default_aligned_free, nullptr};
AllocatorWrapper populate_allocator(AclAllocator *external_allocator)
{
bool is_valid = (external_allocator != nullptr);
- if(is_valid)
+ if (is_valid)
{
is_valid = is_valid && (external_allocator->alloc != nullptr);
is_valid = is_valid && (external_allocator->free != nullptr);
@@ -123,14 +120,13 @@ cpuinfo::CpuIsaInfo populate_capabilities_flags(AclTargetCapabilities external_c
return isa_caps;
}
-CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps,
- int32_t max_threads)
+CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads)
{
CpuCapabilities caps;
// Populate capabilities with system information
caps.cpu_info = cpuinfo::CpuInfo::build();
- if(external_caps != AclCpuCapabilitiesAuto)
+ if (external_caps != AclCpuCapabilitiesAuto)
{
cpuinfo::CpuIsaInfo isa = populate_capabilities_flags(external_caps);
auto cpus = caps.cpu_info.cpus();
@@ -151,11 +147,9 @@ CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps,
} // namespace
CpuContext::CpuContext(const AclContextOptions *options)
- : IContext(Target::Cpu),
- _allocator(default_allocator),
- _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1))
+ : IContext(Target::Cpu), _allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1))
{
- if(options != nullptr)
+ if (options != nullptr)
{
_allocator = populate_allocator(options->allocator);
_caps = populate_capabilities(options->capabilities, options->max_compute_units);
@@ -175,7 +169,7 @@ AllocatorWrapper &CpuContext::allocator()
ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
{
CpuTensor *tensor = new CpuTensor(this, desc);
- if(tensor != nullptr && allocate)
+ if (tensor != nullptr && allocate)
{
tensor->allocate();
}
diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h
index da241ed097..0c8ae49f49 100644
--- a/src/cpu/CpuContext.h
+++ b/src/cpu/CpuContext.h
@@ -25,8 +25,8 @@
#define SRC_CPU_CPUCONTEXT_H
#include "src/common/AllocatorWrapper.h"
-#include "src/common/IContext.h"
#include "src/common/cpuinfo/CpuInfo.h"
+#include "src/common/IContext.h"
namespace arm_compute
{
@@ -36,7 +36,7 @@ namespace cpu
struct CpuCapabilities
{
cpuinfo::CpuInfo cpu_info{};
- int32_t max_threads{ -1 };
+ int32_t max_threads{-1};
};
/** CPU context implementation class */
@@ -60,9 +60,9 @@ public:
AllocatorWrapper &allocator();
    // Inherited methods overridden
- ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
- IQueue *create_queue(const AclQueueOptions *options) override;
- std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+ ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+ IQueue *create_queue(const AclQueueOptions *options) override;
+ std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
const AclTensorDescriptor &dst,
const AclActivationDescriptor &act,
bool is_validate) override;
@@ -74,4 +74,4 @@ private:
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CPU_CPUCONTEXT_H */
\ No newline at end of file
+#endif /* SRC_CPU_CPUCONTEXT_H */
diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp
index 0f0097b3f4..be781d6794 100644
--- a/src/cpu/CpuQueue.cpp
+++ b/src/cpu/CpuQueue.cpp
@@ -29,8 +29,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options)
- : IQueue(ctx)
+CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx)
{
ARM_COMPUTE_UNUSED(options);
}
diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h
index 871a36c85b..b6a2be0e23 100644
--- a/src/cpu/CpuQueue.h
+++ b/src/cpu/CpuQueue.h
@@ -24,10 +24,10 @@
#ifndef SRC_CPU_CPUQUEUE_H
#define SRC_CPU_CPUQUEUE_H
-#include "src/common/IQueue.h"
-
#include "arm_compute/runtime/IScheduler.h"
+#include "src/common/IQueue.h"
+
namespace arm_compute
{
namespace cpu
diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp
index 6dd6d9c31b..59082b5350 100644
--- a/src/cpu/CpuTensor.cpp
+++ b/src/cpu/CpuTensor.cpp
@@ -29,8 +29,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc)
- : ITensorV2(ctx), _legacy_tensor()
+CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
{
ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu));
_legacy_tensor = std::make_unique<Tensor>();
@@ -41,7 +40,7 @@ void *CpuTensor::map()
{
ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
- if(_legacy_tensor == nullptr)
+ if (_legacy_tensor == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!");
return nullptr;
diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h
index b078774c99..89931e1f94 100644
--- a/src/cpu/CpuTensor.h
+++ b/src/cpu/CpuTensor.h
@@ -24,10 +24,10 @@
#ifndef SRC_CPU_CPUTENSOR_H
#define SRC_CPU_CPUTENSOR_H
-#include "src/common/ITensorV2.h"
-
#include "arm_compute/runtime/Tensor.h"
+#include "src/common/ITensorV2.h"
+
namespace arm_compute
{
namespace cpu
@@ -52,7 +52,7 @@ public:
void *map() override;
StatusCode unmap() override;
arm_compute::ITensor *tensor() const override;
- StatusCode import(void *handle, ImportMemoryType type) override;
+ StatusCode import(void *handle, ImportMemoryType type) override;
private:
std::unique_ptr<Tensor> _legacy_tensor;
@@ -60,4 +60,4 @@ private:
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CPU_CPUTENSOR_H */
\ No newline at end of file
+#endif /* SRC_CPU_CPUTENSOR_H */
diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h
index 0f7b9b6552..8726bc470a 100644
--- a/src/cpu/CpuTypes.h
+++ b/src/cpu/CpuTypes.h
@@ -31,6 +31,6 @@ namespace arm_compute
typedef __fp16 float16_t;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
typedef float float32_t;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPUTYPES */
diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h
index 8f4106240d..bcd0cb2c70 100644
--- a/src/cpu/ICpuKernel.h
+++ b/src/cpu/ICpuKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_ICPUKERNEL_H
#include "arm_compute/core/CPP/ICPPKernel.h"
+
#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
namespace arm_compute
@@ -34,7 +35,7 @@ namespace cpu
enum class KernelSelectionType
{
Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */
- Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
+ Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
};
template <class Derived>
@@ -50,13 +51,15 @@ public:
*/
template <typename SelectorType>
- static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported)
+ static const auto *get_implementation(const SelectorType &selector,
+ KernelSelectionType selection_type = KernelSelectionType::Supported)
{
- using kernel_type = typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
+ using kernel_type =
+ typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
- for(const auto &uk : Derived::get_available_kernels())
+ for (const auto &uk : Derived::get_available_kernels())
{
- if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
+ if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
{
return &uk;
}
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index f4bd4e6cad..50bf672d3c 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -26,11 +26,11 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
#include "src/cpu/kernels/activation/list.h"
#include <array>
@@ -43,126 +43,126 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels =
-{
+static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
#ifdef ARM_COMPUTE_ENABLE_SVE
- {
- "sve2_q8_activation_lut",
- [](const ActivationDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.cpumodel == CPUModel::A510 && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)
- },
+ {"sve2_q8_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
+ data.cpumodel == CPUModel::A510 && data.isa.sve2;
+ },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)},
#endif // ARM_COMPUTE_ENABLE_SVE
#ifdef __aarch64__
- {
- // Neon LUT implementation takes precedence
- "neon_q8_activation_lut",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)
- },
+ {// Neon LUT implementation takes precedence
+ "neon_q8_activation_lut",
+ [](const ActivationDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)},
#endif // __aarch64__
- {
- "sve2_qu8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
- },
- {
- "sve2_qs8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
- },
- {
- "sve2_qs16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
- },
- {
- "sve_fp16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
- },
- {
- "sve_fp32_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
- },
- {
- "neon_fp16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)
- },
- {
- "neon_fp32_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)
- },
- {
- "neon_qu8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)
- },
- {
- "neon_qs8_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)
- },
- {
- "neon_qs16_activation",
- [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)
- },
+ {"sve2_qu8_activation",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)},
+ {"sve2_qs8_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)},
+ {"sve2_qs16_activation",
+ [](const ActivationDataTypeISASelectorData &data) {
+ return data.dt == DataType::QSYMM16 && data.isa.sve2 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
+ {"sve_fp16_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ data.f != ActivationLayerInfo::ActivationFunction::GELU;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)},
+ {"sve_fp32_activation",
+ [](const ActivationDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
+ REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)},
+ {"neon_fp16_activation",
+ [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)},
+ {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)},
+ {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)},
+ {"neon_qs8_activation",
+ [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)},
+ {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)},
};
/* Supported activation in the 8-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations =
-{
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
- ActivationLayerInfo::ActivationFunction::GELU,
+static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+ ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU,
};
/* Supported activation in the 16-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations =
-{
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-};
+static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = {
+ ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH,
+ ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
- const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() });
+ const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- const DataType data_type = src->data_type();
- const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
+ const DataType data_type = src->data_type();
+ const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+ const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)),
- "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_data_type_quantized_asymmetric(data_type) &&
+ (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) ==
+ std::end(qasymm8_activations)),
+ "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
+ (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations),
+ f_act) == std::end(qsymm16_activations)),
"For QSYMM16 only tanh and logistic are supported");
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH)
- && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+ ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
// Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -176,7 +176,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
// Configure kernel window
Window win = calculate_max_window(*src, Steps());
- if(dst != nullptr)
+ if (dst != nullptr)
{
// dst auto initialization if not yet initialized
auto_init_if_empty(*dst, *src->clone());
@@ -185,14 +185,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
return std::make_pair(Status{}, win);
}
#ifdef __aarch64__
-void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type,
- const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out,
- ActivationLayerInfo::LookupTable256 &lut, float a, float b)
+void init_lut(ActivationLayerInfo::ActivationFunction act_func,
+ DataType data_type,
+ const UniformQuantizationInfo &qi_in,
+ const UniformQuantizationInfo &qi_out,
+ ActivationLayerInfo::LookupTable256 &lut,
+ float a,
+ float b)
{
- for(size_t i = 0; i < lut.size(); ++i)
+ for (size_t i = 0; i < lut.size(); ++i)
{
- float tmp_f = (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
- switch(act_func)
+ float tmp_f =
+ (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
+ switch (act_func)
{
case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
@@ -246,7 +251,8 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_ty
tmp_f = 0;
break;
}
- lut[i] = (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
+ lut[i] =
+ (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
}
}
#endif // __aarch64__
@@ -258,8 +264,9 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
- const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() });
- if(dst != nullptr)
+ const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
+ if (dst != nullptr)
{
// dst auto initialization if not yet initialized
auto_init_if_empty(*dst, *src->clone());
@@ -271,11 +278,12 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
_name = std::string("CpuActivationKernel").append("/").append(uk->name);
#ifdef __aarch64__
- if(src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
+ if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
{
ActivationLayerInfo::LookupTable256 tmp_lut;
- init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(),
- tmp_lut, activation_info.a(), activation_info.b());
+ init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(),
+ (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut,
+ activation_info.a(), activation_info.b());
activation_info.setLookupTable256(tmp_lut);
}
#endif // __aarch64__
@@ -288,11 +296,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
ICPPKernel::configure(win);
}
-Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status
+CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
return Status{};
}
@@ -302,7 +312,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count
ARM_COMPUTE_UNUSED(thread_count);
ARM_COMPUTE_UNUSED(platform);
- if(_split_dimension == Window::DimX)
+ if (_split_dimension == Window::DimX)
{
// Don't split the work load too small if the tensor has been reinterpreted as 1D.
// This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -314,7 +324,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count
void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
// Early exit on disabled activation
- if(!_act_info.enabled())
+ if (!_act_info.enabled())
{
return;
}
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index 804407653f..4bad9fb3e8 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -38,7 +39,8 @@ namespace kernels
class CpuActivationKernel : public ICpuKernel<CpuActivationKernel>
{
private:
- using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+ using ActivationKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
public:
CpuActivationKernel() = default;
@@ -71,7 +73,7 @@ public:
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
@@ -94,8 +96,8 @@ public:
private:
ActivationLayerInfo _act_info{};
- ActivationKernelPtr _run_method{ nullptr };
- size_t _split_dimension{ Window::DimY };
+ ActivationKernelPtr _run_method{nullptr};
+ size_t _split_dimension{Window::DimY};
std::string _name{};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 2983575cb6..a990aa4715 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -26,19 +26,21 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/add/list.h"
+
#include <array>
#if defined(ENABLE_FP32_KERNELS)
namespace
{
- static constexpr size_t default_mws_N1_fp32_neon = 24536;
- static constexpr size_t default_mws_V1_fp32_neon = 40510;
-}
+static constexpr size_t default_mws_N1_fp32_neon = 24536;
+static constexpr size_t default_mws_V1_fp32_neon = 40510;
+} // namespace
#endif /* ENABLE_FP32_KERNELS */
namespace arm_compute
@@ -49,152 +51,82 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuAddKernel::AddKernel> available_kernels =
-{
- {
- "neon_qu8_add_fixedpoint",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)
- },
- {
- "neon_qs8_add_fixedpoint",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)
- },
- {
- "sve2_qu8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8) && data.isa.sve2;
- },
- REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)
- },
- {
- "sve2_qs8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
- },
- REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)
- },
- {
- "sve2_qs16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QSYMM16) && data.isa.sve2;
- },
- REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)
- },
- {
- "sve_fp32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32) && data.isa.sve;
- },
- REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)
- },
- {
- "sve_fp16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16;
- },
- REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)
- },
- {
- "sve_u8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::U8) && data.isa.sve;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)
- },
- {
- "sve_s16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::S16) && data.isa.sve;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)
- },
- {
- "sve_s32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::S32) && data.isa.sve;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)
- },
- {
- "neon_fp32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)
- },
- {
- "neon_fp16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.fp16;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)
- },
- {
- "neon_u8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)
- },
- {
- "neon_s16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)
- },
- {
- "neon_s32_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)
- },
- {
- "neon_qu8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
- },
- {
- "neon_qs8_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
- },
- {
- "neon_qs16_add",
- [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
- }
-};
-
-Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+static const std::vector<CpuAddKernel::AddKernel> available_kernels = {
+ {"neon_qu8_add_fixedpoint",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)},
+ {"neon_qs8_add_fixedpoint",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)},
+ {"sve2_qu8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)},
+ {"sve2_qs8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)},
+ {"sve2_qs16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; },
+ REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)},
+ {"sve_fp32_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)},
+ {"sve_fp16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data)
+ { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)},
+ {"sve_u8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)},
+ {"sve_s16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)},
+ {"sve_s32_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)},
+ {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)},
+ {"neon_fp16_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)},
+ {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)},
+ {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)},
+ {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)},
+ {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)},
+ {"neon_qs8_add",
+ [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)},
+ {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}};
+
+Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type())
- || (src1.data_type() != dst.data_type())),
- "Broadcasting across width is supported on configurations where all tensors have the same data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (src0.tensor_shape().x() != src1.tensor_shape().x()) &&
+ ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) ||
+ (src1.data_type() != dst.data_type())),
+ "Broadcasting across width is supported on configurations where all tensors have the same data type");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
@@ -202,8 +134,8 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons
}
const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
- const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0.data_type(),
- CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+ CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
@@ -215,9 +147,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
- const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
- const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0->data_type(),
- CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
+ const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+ CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
@@ -237,7 +169,8 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
ICpuKernel::configure(win);
}
-Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+Status
+CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -277,14 +210,14 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &add_fp32_neon)
+ if (this->_run_method == &add_fp32_neon)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_mws_V1_fp32_neon;
}
@@ -294,7 +227,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -307,7 +240,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
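
A note on the structure being reformatted in the hunks above: available_kernels is a table of {name, selection predicate, function pointer} entries, and get_implementation() returns the first entry whose predicate matches the requested data type and the detected ISA. That is why the more specialised SVE2/SVE/fixed-point variants are listed before the generic Neon ones, and why validate() also checks uk->ukernel for nullptr (the REGISTER_* macros resolve to nullptr when the corresponding kernel family is compiled out of the build). A minimal, self-contained sketch of that selection idea — Selector, Entry, pick_kernel and add_f32_scalar below are illustrative stand-ins, not library types:

#include <cstdio>
#include <functional>
#include <vector>

// Illustrative stand-ins for the library's selector data and kernel signature.
struct Selector
{
    bool is_f32;
    bool has_sve;
};
using KernelFn = void (*)(const float *, const float *, float *, int);

struct Entry
{
    const char                           *name;
    std::function<bool(const Selector &)> is_selected;
    KernelFn                              ukernel;
};

static void add_f32_scalar(const float *a, const float *b, float *dst, int n)
{
    for (int i = 0; i < n; ++i)
    {
        dst[i] = a[i] + b[i]; // plain element-wise add
    }
}

// First matching entry wins, so more specialised kernels are listed first.
static const std::vector<Entry> table = {
    {"sve_fp32_add", [](const Selector &s) { return s.is_f32 && s.has_sve; }, nullptr /* not built in this sketch */},
    {"neon_fp32_add", [](const Selector &s) { return s.is_f32; }, add_f32_scalar},
};

static const Entry *pick_kernel(const Selector &s)
{
    for (const auto &e : table)
    {
        if (e.is_selected(s) && e.ukernel != nullptr)
        {
            return &e;
        }
    }
    return nullptr;
}

int main()
{
    const float a[4] = {1.f, 2.f, 3.f, 4.f};
    const float b[4] = {4.f, 3.f, 2.f, 1.f};
    float       d[4] = {};
    if (const Entry *e = pick_kernel(Selector{true, false}))
    {
        e->ukernel(a, b, d, 4);
        std::printf("%s -> %g %g %g %g\n", e->name, d[0], d[1], d[2], d[3]);
    }
    return 0;
}
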
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index 9921feabe2..4adba8bb16 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuAddKernel : public ICpuKernel<CpuAddKernel>
{
private:
- using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ using AddKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
public:
struct AddKernel
@@ -74,10 +75,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Return minimum workload size of the relevant kernel
@@ -98,9 +100,9 @@ public:
private:
ConvertPolicy _policy{};
- AddKernelPtr _run_method{ nullptr };
+ AddKernelPtr _run_method{nullptr};
std::string _name{};
- size_t _split_dimension{ Window::DimY };
+ size_t _split_dimension{Window::DimY};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp
index b84bdd54e9..6a632e8702 100644
--- a/src/cpu/kernels/CpuAddMulAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp
@@ -27,8 +27,8 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/addmuladd/list.h"
@@ -41,36 +41,28 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels =
-{
+static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = {
#ifdef __aarch64__
- {
- "neon_fp32_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)
- },
- {
- "neon_fp16_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)
- },
- {
- "neon_qasymm8_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)
- },
- {
- "neon_qasymm8_signed_add_mul_add",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)
- }
+ {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)},
+ {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)},
+ {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)},
+ {"neon_qasymm8_signed_add_mul_add",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)}
#endif // __aarch64__
};
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
@@ -78,16 +70,16 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
using ActFunction = ActivationLayerInfo::ActivationFunction;
const ActFunction act_func = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
- "Only RELU Family activations, or no activation, is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU &&
+ act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
+ "Only RELU Family activations, or no activation, is supported");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- if(is_data_type_quantized(input1->data_type()))
+ if (is_data_type_quantized(input1->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32);
@@ -101,39 +93,47 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], "First dimensions of inputs and batchNorm coefs should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0],
+ "First dimensions of inputs and batchNorm coefs should match");
// Validate in case we have add layer's output (intermediate) initialized
- if(add_output != nullptr && add_output->total_size() > 0)
+ if (add_output != nullptr && add_output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output);
}
// Validate in case final output has been initialized
- if(final_output->total_size() > 0)
+ if (final_output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output);
}
- const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+ DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
}
} // namespace
-void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAddKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2);
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
- const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+ DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
@@ -146,7 +146,7 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo
set_shape_if_empty(*final_output, input1->tensor_shape());
set_data_type_if_unknown(*final_output, input1->data_type());
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
set_shape_if_empty(*add_output, input1->tensor_shape());
set_data_type_if_unknown(*add_output, input1->data_type());
@@ -158,14 +158,19 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo
ICpuKernel::configure(win);
}
-Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAddKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
return Status{};
}
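
For context on the kernel reformatted above, the checks in validate_arguments suggest the following semantics: the two inputs are added element-wise, the intermediate sum may be written to add_output, and the result is then scaled and shifted per channel by the 1-D bn_mul / bn_add vectors before a RELU-family activation is applied. A scalar reference sketch under those assumptions (the flat layout with channels as the fastest-moving dimension and the helper name are illustrative, not part of the library):

#include <algorithm>
#include <cstddef>
#include <vector>

// Scalar reference for the fused add + per-channel multiply-add + ReLU.
// Layout assumption (illustrative): channels are the fastest-moving dimension,
// matching the "first dimensions of inputs and batchnorm coefs should match" check.
static void add_mul_add_reference(const std::vector<float> &input1,
                                  const std::vector<float> &input2,
                                  const std::vector<float> &bn_mul,
                                  const std::vector<float> &bn_add,
                                  std::vector<float>       &add_output,
                                  std::vector<float>       &final_output)
{
    const std::size_t channels = bn_mul.size();
    for (std::size_t i = 0; i < input1.size(); ++i)
    {
        const float sum = input1[i] + input2[i];
        add_output[i]   = sum; // optional intermediate result
        const float bn  = sum * bn_mul[i % channels] + bn_add[i % channels];
        final_output[i] = std::max(bn, 0.0f); // ActivationFunction::RELU
    }
}

int main()
{
    const std::vector<float> in1 = {1.f, -2.f, 3.f, -4.f};
    const std::vector<float> in2 = {0.5f, 0.5f, 0.5f, 0.5f};
    const std::vector<float> mul = {2.f, 2.f}; // two "channels"
    const std::vector<float> add = {0.f, 1.f};
    std::vector<float>       mid(in1.size()), out(in1.size());
    add_mul_add_reference(in1, in2, mul, add, mid, out);
    return 0;
}
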
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h
index 67ce6f029a..c5e31ec291 100644
--- a/src/cpu/kernels/CpuAddMulAddKernel.h
+++ b/src/cpu/kernels/CpuAddMulAddKernel.h
@@ -26,6 +26,7 @@
#define SRC_CPU_KERNELS_CPUADDMULADDKERNEL
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,8 +40,15 @@ namespace kernels
class CpuAddMulAddKernel : public ICpuKernel<CpuAddMulAddKernel>
{
private:
- using AddMulAddKernelPtr =
- std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, const ITensor *, ITensor *, ITensor *, ConvertPolicy, const ActivationLayerInfo &, const Window &)>::type;
+ using AddMulAddKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ ConvertPolicy,
+ const ActivationLayerInfo &,
+ const Window &)>::type;
public:
struct AddMulAddKernel
@@ -57,23 +65,31 @@ public:
* Similar to @ref NEAddMulAdd::configure()
*
*/
- void configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuAddMulAddKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
static const std::vector<AddMulAddKernel> &get_available_kernels();
@@ -81,7 +97,7 @@ public:
private:
ConvertPolicy _policy{};
ActivationLayerInfo _act_info{};
- AddMulAddKernelPtr _run_method{ nullptr };
+ AddMulAddKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
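
The CpuCastKernel.cpp hunks that follow mostly reflow calls to execute_window_loop whose lambda bodies share one shape: a main loop that advances x by window_step_x lanes using Neon loads/stores, then a scalar loop for the left-over elements. A stripped-down sketch of that split, in plain C++ without intrinsics (cast_u8_to_s16 is an illustrative stand-in, not the library kernel):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Widening "cast" of uint8_t -> int16_t with a vector-style main loop and a
// scalar tail, mirroring the window_step_x / left-over split used in the kernels.
static void cast_u8_to_s16(const uint8_t *src, int16_t *dst, int window_start_x, int window_end_x)
{
    constexpr int window_step_x = 16; // one "vector" of 16 elements per iteration
    int           x             = window_start_x;
    for (; x <= (window_end_x - window_step_x); x += window_step_x)
    {
        for (int lane = 0; lane < window_step_x; ++lane) // stands in for vld1q_u8 + vmovl_u8 + vst1q_s16
        {
            dst[x + lane] = static_cast<int16_t>(src[x + lane]);
        }
    }
    for (; x < window_end_x; ++x) // compute left-over elements one by one
    {
        dst[x] = static_cast<int16_t>(src[x]);
    }
}

int main()
{
    std::vector<uint8_t> src(37);
    std::vector<int16_t> dst(src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        src[i] = static_cast<uint8_t>(i * 7u);
    }
    cast_u8_to_s16(src.data(), dst.data(), 0, static_cast<int>(src.size()));
    std::printf("dst[0]=%d dst[36]=%d\n", dst[0], dst[36]);
    return 0;
}
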
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp
index 764a1ec71c..05c7742b03 100644
--- a/src/cpu/kernels/CpuCastKernel.cpp
+++ b/src/cpu/kernels/CpuCastKernel.cpp
@@ -28,16 +28,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-
#include "src/cpu/kernels/cast/list.h"
+#include "support/SaturateCast.h"
namespace arm_compute
{
@@ -47,38 +47,30 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuCastKernel::CastKernel> available_kernels =
-{
- {
- "neon_qs8_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)
- },
- {
- "neon_qu8_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
- },
- {
- "neon_u8_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
- },
- {
- "neon_fp16_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)
- },
- {
- "neon_fp32_to_fp16_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)
- },
- {
- "neon_s32_cast",
- [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)
- },
+static const std::vector<CpuCastKernel::CastKernel> available_kernels = {
+ {"neon_qs8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)},
+ {"neon_qu8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+ {"neon_u8_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+ {"neon_fp16_cast",
+ [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)},
+ {"neon_fp32_to_fp16_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)},
+ {"neon_s32_cast",
+ [](const CastDataTypeISASelectorData &data)
+ { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)},
};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
@@ -88,57 +80,67 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
#ifdef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::F32, DataType::S32, DataType::S64, DataType::U64);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::U32, DataType::S32, DataType::F32, DataType::S64);
#else // __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::F32, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::U8, DataType::S16, DataType::U16, DataType::F16,
DataType::U32, DataType::S32, DataType::F32);
#endif // __aarch64__
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
- && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 &&
+ dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
"Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32),
"Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 &&
+ (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 &&
+ (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::U8
- && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 &&
+ dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
"Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::F16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
"Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::F16
- && dst->data_type() != DataType::F32
- && dst->data_type() != DataType::U8
- && dst->data_type() != DataType::S64),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 &&
+ (dst->data_type() != DataType::QASYMM8_SIGNED &&
+ dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+ dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 &&
+ dst->data_type() != DataType::S64),
"Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64");
#ifdef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32,
@@ -149,7 +151,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
#endif // __aarch64__
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
@@ -193,15 +195,8 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr)
template <>
inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr)
{
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr),
- vld1q_s32(src_ptr + 4),
- vld1q_s32(src_ptr + 8),
- vld1q_s32(src_ptr + 12)
- }
- };
+ const int32x4x4_t texels = {
+ {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}};
vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0])));
vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0])));
vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1])));
@@ -215,33 +210,14 @@ inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int6
template <>
inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr)
{
- const float64x2x4_t texels0 =
- {
- {
- vcvtq_f64_s64(vld1q_s64(src_ptr)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 4)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 6))
- }
- };
- const float64x2x4_t texels1 =
- {
- {
- vcvtq_f64_s64(vld1q_s64(src_ptr + 8)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 12)),
- vcvtq_f64_s64(vld1q_s64(src_ptr + 14))
- }
- };
- const float32x4x4_t texels =
- {
- {
- vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
- vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
- vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
- vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
- }
- };
+ const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}};
+ const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
+ vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}};
+ const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
vst1q_f32(dst_ptr, texels.val[0]);
vst1q_f32(dst_ptr + 4, texels.val[1]);
vst1q_f32(dst_ptr + 8, texels.val[2]);
@@ -251,34 +227,15 @@ inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float
template <>
inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr)
{
- const float64x2x4_t texels0 =
- {
- {
- vcvtq_f64_u64(vld1q_u64(src_ptr)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 4)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 6))
- }
- };
- const float64x2x4_t texels1 =
- {
- {
- vcvtq_f64_u64(vld1q_u64(src_ptr + 8)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 12)),
- vcvtq_f64_u64(vld1q_u64(src_ptr + 14))
- }
- };
+ const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}};
+ const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
+ vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}};
- const float32x4x4_t texels =
- {
- {
- vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
- vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
- vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
- vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
- }
- };
+ const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+ vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
vst1q_f32(dst_ptr, texels.val[0]);
vst1q_f32(dst_ptr + 4, texels.val[1]);
@@ -287,23 +244,26 @@ inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, floa
}
template <typename T1, typename T2>
-inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
+inline void
+convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
- }
- },
- src, dst);
+ const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
} // namespace
#endif // __aarch64__
@@ -325,21 +285,22 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
/*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */
- const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = CpuCastKernel::get_implementation(
+ CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()});
- switch(_src->info()->data_type())
+ switch (_src->info()->data_type())
{
#ifdef __aarch64__
case DataType::U64:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::F32:
{
@@ -353,7 +314,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
}
case DataType::S64:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::F32:
{
@@ -369,111 +330,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::QASYMM8_SIGNED:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::S16:
{
/* Up-conversion QASYMM8_SIGNED -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s16(dst_ptr + x, texels.val[0]);
+ vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion QASYMM8_SIGNED -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion QASYMM8_SIGNED -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+
+ const int16x8x2_t texels = {
+ {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F16:
@@ -492,111 +444,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::QASYMM8:
case DataType::U8:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::S16:
{
/* Up-conversion U8 -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s16(dst_ptr + x, texels.val[0]);
+ vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion U8 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion U8 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F16:
@@ -609,35 +552,32 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::U16:
{
/* Up-conversion U8 -> U16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
- const uint16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_u8(vget_low_u8(texels_u8)),
- vmovl_u8(vget_high_u8(texels_u8))
- }
- };
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- vst1q_u16(dst_ptr + x, texels.val[0]);
- vst1q_u16(dst_ptr + x + 8, texels.val[1]);
- }
+ const uint16x8x2_t texels = {
+ {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_u16(dst_ptr + x, texels.val[0]);
+ vst1q_u16(dst_ptr + x + 8, texels.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -647,177 +587,154 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
}
case DataType::S16:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion S16 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
- }
+ vst1q_s8(dst_ptr + x,
+ vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
- }
+ vst1q_s8(dst_ptr + x,
+ vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
case DataType::U8:
{
/* Down-conversion S16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
- }
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
- vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
- }
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+ vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
case DataType::S32:
{
/* Up-conversion S16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
+ const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- const int32x4x4_t texels_s32 =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s16(vget_low_s16(texels.val[0])),
- vmovl_s16(vget_high_s16(texels.val[0])),
- vmovl_s16(vget_low_s16(texels.val[1])),
- vmovl_s16(vget_high_s16(texels.val[1]))
- }
- };
+ const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
- vst1q_s32(dst_ptr + x, texels_s32.val[0]);
- vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
- vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
- vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
- }
+ const int32x4x4_t texels_s32 = {
+ {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
+ vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_s32(dst_ptr + x, texels_s32.val[0]);
+ vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
+ vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
+ vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -828,104 +745,92 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::U16:
{
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::U8:
{
/* Down-conversion U16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
- }
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
+ vst1q_u8(dst_ptr + x,
+ vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+ }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
case DataType::U32:
{
/* Up-conversion U16 -> U32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
- vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
+ const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
- },
- src, dst);
+ vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
+ vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
+ vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
+ vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -941,7 +846,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
break;
}
case DataType::F32:
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::F16:
{
@@ -953,105 +858,110 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::S32:
{
/* Conversion F32 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float32x4x4_t texels = {{
vld1q_f32(src_ptr + x),
vld1q_f32(src_ptr + x + 4),
vld1q_f32(src_ptr + x + 8),
vld1q_f32(src_ptr + x + 12),
- }
- };
+ }};
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
- }
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
/* Down-conversion F32 -> U8 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float32x4x4_t texels = {{
vld1q_f32(src_ptr + x),
vld1q_f32(src_ptr + x + 4),
vld1q_f32(src_ptr + x + 8),
vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
- }
+ }};
+
+ vst1_u8(dst_ptr + x,
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
+ vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
+ vst1_u8(dst_ptr + x + 8,
+ vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
+ vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion F32 -> QASYMM8_SIGNED */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float32x4x4_t texels = {{
vld1q_f32(src_ptr + x),
vld1q_f32(src_ptr + x + 4),
vld1q_f32(src_ptr + x + 8),
vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ }};
+
+ vst1_s8(dst_ptr + x,
+ vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
+ vst1_s8(dst_ptr + x + 8,
+ vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
+ vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
@@ -1060,7 +970,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
}
break;
case DataType::S32:
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
#if __aarch64__
case DataType::S64:
@@ -1079,104 +989,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::F32:
{
/* Conversion S32 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int32x4x4_t texels = {{
vld1q_s32(src_ptr + x),
vld1q_s32(src_ptr + x + 4),
vld1q_s32(src_ptr + x + 8),
vld1q_s32(src_ptr + x + 12),
- }
- };
+ }};
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
- }
+ vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
+ vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
+ vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
+ vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion S32 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int32x4x4_t texels = {{
vld1q_s32(src_ptr + x),
vld1q_s32(src_ptr + x + 4),
vld1q_s32(src_ptr + x + 8),
vld1q_s32(src_ptr + x + 12),
- }
- };
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
- }
+ }};
+ vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]),
+ vqmovn_s32(texels.val[1]))));
+ vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]),
+ vqmovn_s32(texels.val[3]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
- vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+
+ vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]),
+ vmovn_s32(texels.val[1]))));
+ vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]),
+ vmovn_s32(texels.val[3]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
@@ -1184,68 +1092,66 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
case DataType::U8:
{
/* Down-conversion S32 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
+ if (ConvertPolicy::SATURATE == _policy)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
- }
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+ vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
+ vqmovun_s32(texels.val[1]))));
+ vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
+ vqmovun_s32(texels.val[3]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
- vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+ vld1q_s32(src_ptr + x + 8),
+ vld1q_s32(src_ptr + x + 12)}};
+
+ vst1_u8(dst_ptr + x,
+ vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
+ vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
+ vst1_u8(dst_ptr + x + 8,
+ vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
+ vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
break;
}
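
The S32 -> U8 block above keeps two code paths: a saturating one built on vqmovun_s32/vqmovn_u16 and a wrapping one built on the vmovn_* narrows. A minimal standalone sketch of the same idea, with hypothetical src/dst buffers and assuming an Arm target with <arm_neon.h> available, could look like this:

// Sketch of the saturating and wrapping S32 -> U8 down-conversion paths.
// Buffers and sizes are hypothetical; not the library's kernel itself.
#include <arm_neon.h>
#include <algorithm>
#include <cstdint>

void s32_to_u8(const int32_t *src, uint8_t *dst, int n, bool saturate)
{
    int x = 0;
    for (; x <= n - 16; x += 16)
    {
        const int32x4x4_t texels = {{vld1q_s32(src + x), vld1q_s32(src + x + 4),
                                     vld1q_s32(src + x + 8), vld1q_s32(src + x + 12)}};
        if (saturate)
        {
            // vqmovun_s32: signed 32-bit -> unsigned 16-bit with saturation, then vqmovn_u16 to 8-bit.
            vst1_u8(dst + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
            vst1_u8(dst + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
        }
        else
        {
            // vmovn_* keeps only the low bits (wrap-around semantics).
            vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
                                                    vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
            vst1_u8(dst + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
                                                        vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
        }
    }
    // Scalar tail, mirroring the "Compute left-over elements" loops in the kernel.
    for (; x < n; ++x)
    {
        dst[x] = saturate ? static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(src[x], 0), 255))
                          : static_cast<uint8_t>(src[x]);
    }
}
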
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
index a7e6417ff2..ddbfe1f034 100644
--- a/src/cpu/kernels/CpuCastKernel.h
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -40,7 +40,8 @@ namespace kernels
class CpuCastKernel : public ICpuKernel<CpuCastKernel>
{
private:
- using CastKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
+ using CastKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
public:
CpuCastKernel() = default;
@@ -76,7 +77,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct CastKernel
@@ -89,7 +90,7 @@ public:
static const std::vector<CastKernel> &get_available_kernels();
private:
- ConvertPolicy _policy{ ConvertPolicy::SATURATE };
+ ConvertPolicy _policy{ConvertPolicy::SATURATE};
};
} // namespace kernels
} // namespace cpu
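
The header change above only reflows the CastKernelPtr alias, but it points at the dispatch pattern the kernel relies on: a std::add_pointer function-pointer alias plus a static table of candidate kernels. A simplified sketch of that pattern, using hypothetical Tensor/DType stand-ins rather than the library's real types:

// Minimal sketch of a function-pointer table keyed by (source, destination) data type.
// All names here are illustrative placeholders.
#include <type_traits>
#include <vector>

struct Tensor; // opaque placeholder
enum class DType { U8, S16, S32, F32 };

using CastFnPtr = std::add_pointer<void(const Tensor *, Tensor *, int n)>::type;

struct CastEntry
{
    DType     src;
    DType     dst;
    CastFnPtr fn;
};

void cast_s32_to_u8(const Tensor *, Tensor *, int) { /* ... */ }
void cast_f32_to_s32(const Tensor *, Tensor *, int) { /* ... */ }

// A static table of available conversions; selection scans for a matching type pair.
const std::vector<CastEntry> &available_casts()
{
    static const std::vector<CastEntry> table = {
        {DType::S32, DType::U8, &cast_s32_to_u8},
        {DType::F32, DType::S32, &cast_f32_to_s32},
    };
    return table;
}

CastFnPtr select_cast(DType src, DType dst)
{
    for (const auto &e : available_casts())
    {
        if (e.src == src && e.dst == dst)
        {
            return e.fn;
        }
    }
    return nullptr; // caller validates the pair, as the kernel's validate() step does
}
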
diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp
index bf5a44d78b..a52a1f58ea 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.cpp
+++ b/src/cpu/kernels/CpuCol2ImKernel.cpp
@@ -29,8 +29,9 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -49,9 +50,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
// Validate configured output
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ compute_col2im_shape(*src, convolved_dims, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -106,13 +108,16 @@ void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const T
Iterator in(src, window);
Iterator out(dst, window_out);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int hidx = id.y();
- const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x;
- std::memcpy(out.ptr() + idx, in.ptr(), el_size);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int hidx = id.y();
+ const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y +
+ (hidx % _convolved_dims.width) * output_stride_x;
+ std::memcpy(out.ptr() + idx, in.ptr(), el_size);
+ },
+ in, out);
}
const char *CpuCol2ImKernel::name() const
@@ -121,4 +126,4 @@ const char *CpuCol2ImKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
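
The reflowed col2im loop above computes, per element, a destination offset from the convolved output width and the three output byte strides. The same addressing in isolation, with hypothetical stride values passed in explicitly:

// Scalar sketch of the col2im addressing: an input element at (x = channel, hidx = spatial
// position) is copied to (channel, row, col) in the output. Strides are in bytes.
#include <cstdint>
#include <cstring>

void col2im_element(const uint8_t *in_elem, uint8_t *out_base, size_t el_size,
                    int x, int hidx, int convolved_width,
                    size_t stride_x, size_t stride_y, size_t stride_z)
{
    const int row = hidx / convolved_width; // spatial row in the convolved output
    const int col = hidx % convolved_width; // spatial column in the convolved output
    const size_t idx = x * stride_z + row * stride_y + col * stride_x;
    std::memcpy(out_base + idx, in_elem, el_size);
}
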
diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h
index deafcc14df..3e394ac914 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.h
+++ b/src/cpu/kernels/CpuCol2ImKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H
#include "arm_compute/core/Size2D.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -75,7 +76,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
index 29d40f0e52..8c290173e8 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
@@ -30,10 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -50,13 +51,14 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c
uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
// Offset dst
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ batch_offset * dst->info()->strides_in_bytes()[3];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const int window_step_x = 16 / dst->info()->element_size();
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1));
@@ -66,66 +68,74 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr,
+ vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
@@ -154,7 +164,7 @@ void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int b
_func = nullptr;
_batch_offset = batch_offset;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::S8:
case DataType::U8:
@@ -196,9 +206,7 @@ void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &windo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _batch_offset,
+ (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset,
window);
}
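
When the batch-concatenated source and destination are QASYMM8 tensors with different quantisation info, the loop above re-expresses every value through dequantize/quantize helpers. A scalar sketch of that requantisation, assuming the usual affine mapping real = scale * (q - offset); the helper names below are illustrative, not the library's:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float scale;
    int   offset;
};

inline float dequantize_u8(uint8_t q, UniformQInfo qi)
{
    return qi.scale * (static_cast<int>(q) - qi.offset);
}

inline uint8_t quantize_u8(float v, UniformQInfo qi)
{
    const int q = static_cast<int>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // clamp to the U8 range
}

// Copy one row into the already-offset destination row, re-expressed in the
// destination's quantisation.
void requantize_row(const uint8_t *src, uint8_t *dst, int n, UniformQInfo src_qi, UniformQInfo dst_qi)
{
    for (int x = 0; x < n; ++x)
    {
        dst[x] = quantize_u8(dequantize_u8(src[x], src_qi), dst_qi);
    }
}
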
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h
index 0de68a5d64..52ea553a7d 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.h
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h
@@ -57,15 +57,15 @@ public:
static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
private:
- BatchConcatFunction *_func{ nullptr };
- unsigned int _batch_offset{ 0 };
+ BatchConcatFunction *_func{nullptr};
+ unsigned int _batch_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
index ebc5322aee..c75e1e4477 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
@@ -30,11 +30,12 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <cstdint>
@@ -53,13 +54,14 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c
uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
// Offset destination
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ depth_offset * dst->info()->strides_in_bytes()[2];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const int window_step_x = 16 / dst->info()->element_size();
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1));
@@ -69,64 +71,73 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x,
+ vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x,
+ vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+ const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
@@ -134,7 +145,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, c
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
@@ -154,7 +166,7 @@ void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int d
_func = nullptr;
_depth_offset = depth_offset;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
_func = &depth_concat<uint8_t>;
@@ -192,9 +204,7 @@ void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &windo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _depth_offset,
+ (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset,
window);
}
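
The depth-concatenation kernel above first advances the destination pointer by depth_offset planes (strides_in_bytes()[2]) and then performs a plain vectorised copy in the non-quantised case. Reduced to a standalone sketch with hypothetical byte strides and plane sizes:

// Offset the destination by depth_offset planes, then copy one plane 16 bytes at a time.
#include <arm_neon.h>
#include <cstdint>

void concat_one_plane(const uint8_t *src_plane, uint8_t *dst_base,
                      unsigned int depth_offset, size_t dst_stride_z_bytes,
                      size_t plane_bytes)
{
    // Same idea as "dst_ptr = dst->buffer() + ... + depth_offset * strides_in_bytes()[2]".
    uint8_t *dst_plane = dst_base + depth_offset * dst_stride_z_bytes;

    size_t x = 0;
    for (; x + 16 <= plane_bytes; x += 16)
    {
        vst1q_u8(dst_plane + x, vld1q_u8(src_plane + x)); // 16-byte vector copy
    }
    for (; x < plane_bytes; ++x) // leftover bytes
    {
        dst_plane[x] = src_plane[x];
    }
}
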
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h
index 5a0edb95bb..54de9aff46 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h
@@ -65,15 +65,15 @@ public:
static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
private:
- DepthConcatFunction *_func{ nullptr };
- unsigned int _depth_offset{ 0 };
+ DepthConcatFunction *_func{nullptr};
+ unsigned int _depth_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
index 47a2b44443..b6c11d948b 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
@@ -30,10 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <cstdint>
@@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
@@ -91,13 +92,14 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind
auto dst = tensors.get_tensor(TensorType::ACL_DST);
// Offset destination pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
const int window_step_x = 16;
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
@@ -108,64 +110,74 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
-
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_u8(dst_ptr + dst_it.offset() + x,
+ vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+ vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo),
+ dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = src_it.ptr();
+ const auto out_ptr = dst_ptr + dst_it.offset();
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
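
A detail worth noting in the height (and width) concatenation kernels is that window_end_x is pre-multiplied by the element size, so the generic copy path walks bytes with a fixed 16-byte vector step regardless of data type. A sketch of that byte-domain copy, with hypothetical row sizes:

// One loop serves every data type because the range is expressed in bytes.
#include <arm_neon.h>
#include <cstdint>

void copy_row_bytes(const uint8_t *src_row, uint8_t *dst_row, int row_elements, int element_size)
{
    const int end_bytes = row_elements * element_size;

    int x = 0;
    for (; x <= end_bytes - 16; x += 16)
    {
        vst1q_u8(dst_row + x, vld1q_u8(src_row + x)); // 16-byte vector copy
    }
    for (; x < end_bytes; ++x) // leftover bytes
    {
        dst_row[x] = src_row[x];
    }
}
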
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h
index 74d5d0c2c3..df880c4878 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.h
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h
@@ -58,11 +58,11 @@ public:
static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- unsigned int _height_offset{ 0 };
+ unsigned int _height_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
index f00b37a01b..f6100cccca 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
@@ -24,12 +24,12 @@
#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Steps.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Steps.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
namespace arm_compute
{
@@ -47,7 +47,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
@@ -86,13 +86,14 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo
auto dst = tensors.get_tensor(TensorType::ACL_DST);
// Offset output pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0];
+ uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+ _width_offset * dst->info()->strides_in_bytes()[0];
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
constexpr int window_step_x = 16;
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
// Create iterators
@@ -101,62 +102,73 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo
const DataType dt = src->info()->data_type();
const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+ if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_u8(dst_ptr + dst_it.offset() + x,
+ vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+ else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+ vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo),
+ dst_qinfo));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + dst_it.offset() + x) =
+ quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+ }
+ },
+ src_it, dst_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
+ const auto in_ptr = src_it.ptr();
+ const auto out_ptr = dst_ptr + dst_it.offset();
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = *(in_ptr + x);
+ }
+ },
+ src_it, dst_it);
}
}
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h
index 418bc51b33..560e44e35a 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h
@@ -58,11 +58,11 @@ public:
static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- unsigned int _width_offset{ 0 };
+ unsigned int _width_offset{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
index 08b39deef2..87703ec631 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -34,8 +35,10 @@ namespace cpu
{
namespace kernels
{
-void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -43,7 +46,8 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT
// Output tensor auto initialisation if not yet initialized
auto_init_if_empty(*dst, *src->clone());
- ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
@@ -62,8 +66,10 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT
ICpuKernel::configure(win);
}
-Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
@@ -72,7 +78,7 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
// Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -97,11 +103,15 @@ void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W
Iterator input(src, window);
Iterator output(dst, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size);
- },
- input);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ memcpy(output.ptr() + id.x() * dst_stride_x +
+ (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y,
+ input.ptr(), element_size);
+ },
+ input);
}
const char *CpuConvertFullyConnectedWeightsKernel::name() const
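
The run_op hunk above permutes weight rows when converting fully-connected weights between NCHW- and NHWC-trained layouts: source row y lands at row (y % factor1) * factor2 + y / factor1 of the destination. An equivalent row-wise sketch, assuming contiguous rows and num_rows == factor1 * factor2 (all names hypothetical):

#include <cstdint>
#include <cstring>

void permute_fc_weight_rows(const uint8_t *src, uint8_t *dst,
                            unsigned int num_rows, unsigned int row_bytes,
                            unsigned int factor1, unsigned int factor2)
{
    // The mapping is a bijection when num_rows == factor1 * factor2.
    for (unsigned int y = 0; y < num_rows; ++y)
    {
        const unsigned int new_y = (y % factor1) * factor2 + y / factor1;
        std::memcpy(dst + new_y * row_bytes, src + y * row_bytes, row_bytes);
    }
}
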
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
index 9a1393323b..2253889e69 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -53,24 +53,32 @@ public:
* @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
- unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
+ unsigned int _factor1{
+ 0}; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
+ unsigned int _factor2{
+ 0}; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
index 1005d001ab..745b1566c2 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
@@ -29,9 +29,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
// Validate output if initialized
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
@@ -60,11 +61,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
{
// Output auto inizialitation if not yet initialized
{
- const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
- const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
+ const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
const int offset_correction = is_input_signed ? -128 : 128;
- const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+ const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
}
@@ -110,27 +111,29 @@ void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Win
const uint8_t mask = 128;
const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
- *(output_ptr + x) = in ^ mask;
- }
- },
- input, output);
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+ *(output_ptr + x) = in ^ mask;
+ }
+ },
+ input, output);
}
const char *CpuConvertQuantizedSignednessKernel::name() const
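Note on the kernel above: it flips QASYMM8 data to QASYMM8_SIGNED (and back) by XOR-ing each byte with 0x80, while validate_and_configure_window() compensates by shifting the quantization offset by +/-128 so the represented real values are unchanged. A minimal standalone sketch of that equivalence, in plain C++ with illustrative names and example parameters (not the library's types):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the signedness flip: XOR with 0x80 toggles the MSB, mapping
// u8 [0,255] onto the same bit patterns as s8 [-128,127]. Keeping the
// dequantized value unchanged requires shifting the zero point by -128.
struct QParams { float scale; int32_t offset; };

static float dequant_u8(uint8_t q, QParams p) { return p.scale * (static_cast<int32_t>(q) - p.offset); }
static float dequant_s8(int8_t q, QParams p)  { return p.scale * (static_cast<int32_t>(q) - p.offset); }

int main()
{
    const QParams u8_q{0.05f, 130};                    // arbitrary example parameters
    const QParams s8_q{u8_q.scale, u8_q.offset - 128}; // offset correction, as in the configure step above

    std::vector<uint8_t> in{0, 7, 128, 200, 255};
    for (uint8_t v : in)
    {
        const int8_t flipped = static_cast<int8_t>(v ^ 0x80); // same operation as 'in ^ mask' in the kernel loop
        std::printf("u8 %3d -> s8 %4d | real %.3f vs %.3f\n", static_cast<int>(v), static_cast<int>(flipped),
                    dequant_u8(v, u8_q), dequant_s8(flipped, s8_q));
    }
    return 0;
}

Both columns of "real" values print identically, which is exactly why the kernel only needs the XOR.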
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
index b5eaf65487..e94d3d5ef2 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -54,7 +54,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
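As in the header above, run_op() receives its operands through an ITensorPack rather than through member pointers, keeping the kernel object free of data state. A dependency-free sketch of that slot-indexed pack idiom, using hypothetical Tensor/TensorPack stand-ins and arbitrary slot values (not the library's implementation):

#include <cstdio>
#include <unordered_map>

// Minimal stand-in for the tensor-pack idiom: tensors are looked up by slot id
// at run time instead of being stored on the kernel.
enum TensorSlot { ACL_SRC = 0, ACL_DST = 30 }; // slot values arbitrary for this sketch

struct Tensor { const char *name; };

class TensorPack
{
public:
    void    add_tensor(int slot, Tensor *t) { _pack[slot] = t; }
    Tensor *get_tensor(int slot) const
    {
        auto it = _pack.find(slot);
        return it != _pack.end() ? it->second : nullptr;
    }

private:
    std::unordered_map<int, Tensor *> _pack;
};

int main()
{
    Tensor src{"src"}, dst{"dst"};
    TensorPack pack;
    pack.add_tensor(ACL_SRC, &src);
    pack.add_tensor(ACL_DST, &dst);
    std::printf("src=%s dst=%s\n", pack.get_tensor(ACL_SRC)->name, pack.get_tensor(ACL_DST)->name);
    return 0;
}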
diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp
index 3f0f3fe422..1b693d7a3a 100644
--- a/src/cpu/kernels/CpuCopyKernel.cpp
+++ b/src/cpu/kernels/CpuCopyKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,9 +49,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
// Validate destination if initialized
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -64,7 +66,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src,
return std::make_pair(Status{}, calculate_max_window(*dst));
}
-std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
+std::pair<Status, Window>
+validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
{
const TensorShape src_shape = src->tensor_shape();
const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding);
@@ -84,7 +87,7 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa
_padding = padding;
std::pair<Status, Window> win_config;
- if(padding.empty())
+ if (padding.empty())
{
win_config = validate_and_configure_window(src, dst);
}
@@ -97,17 +100,20 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa
ICpuKernel::configure(win_config.second);
}
-Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding)
+Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src,
+ const arm_compute::ITensorInfo *dst,
+ const PaddingList &padding)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding));
- if(padding.empty())
+ if (padding.empty())
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
}
return Status{};
@@ -122,38 +128,41 @@ void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_padding.empty())
+ if (_padding.empty())
{
- Window dst_window{ window };
- dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
+ Window dst_window{window};
+ dst_window.set(Window::DimX,
+ Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
Window out_slice = dst_window.first_slice_window_1D();
do
{
Iterator src_it(src, out_slice);
Iterator dst_it(dst, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size());
- },
- src_it, dst_it);
- }
- while(dst_window.slide_window_slice_1D(out_slice));
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &)
+ { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); },
+ src_it, dst_it);
+ } while (dst_window.slide_window_slice_1D(out_slice));
}
else
{
- Window src_window{ window };
- src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
+ Window src_window{window};
+ src_window.set(Window::DimX,
+ Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
Iterator src_it(src, src_window);
Iterator dst_it(dst, window);
const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size();
- execute_window_loop(window, [&](const Coordinates &)
- {
- auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
- std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
- },
- src_it, dst_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
+ std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
+ },
+ src_it, dst_it);
}
}
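In the padded branch above, each source row is memcpy'd into the destination at an element offset of _padding[0].first. A minimal sketch of that row copy without the Window/Iterator machinery (the function name and the float element type are illustrative):

#include <cstdio>
#include <cstring>
#include <vector>

// Copy a 2D tensor row by row, left-padding each destination row by
// 'pad_left' elements, mirroring the memcpy in the padded branch above.
static void copy_rows_with_left_pad(const float *src, float *dst,
                                    size_t rows, size_t src_cols, size_t pad_left)
{
    const size_t dst_cols = src_cols + pad_left;
    for (size_t r = 0; r < rows; ++r)
    {
        float *dst_row = dst + r * dst_cols + pad_left; // dst_ptr = dst_it.ptr() + pad * element_size
        std::memcpy(dst_row, src + r * src_cols, src_cols * sizeof(float));
    }
}

int main()
{
    const size_t rows = 2, cols = 3, pad = 2;
    std::vector<float> src{1, 2, 3, 4, 5, 6};
    std::vector<float> dst(rows * (cols + pad), 0.f);
    copy_rows_with_left_pad(src.data(), dst.data(), rows, cols, pad);
    for (float v : dst)
        std::printf("%.0f ", v); // 0 0 1 2 3 0 0 4 5 6
    std::printf("\n");
    return 0;
}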
diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h
index c9ef8eba76..a05053f07e 100644
--- a/src/cpu/kernels/CpuCopyKernel.h
+++ b/src/cpu/kernels/CpuCopyKernel.h
@@ -55,7 +55,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList());
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index d6c56d2012..82e3a5ce00 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/traits.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
#include "src/cpu/kernels/depthwiseconv2d/list.h"
namespace arm_compute
@@ -41,72 +42,53 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels =
-{
- {
- "neon_qu8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QASYMM8);
- },
- REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)
- },
- {
- "neon_qs8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QASYMM8_SIGNED);
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)
- },
- {
- "neon_fp16_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::F16 && data.isa.fp16);
- },
- REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)
- },
- {
- "neon_fp32_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::F32);
- },
- REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)
- },
- {
- "neon_qp8_qu8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8);
- },
- REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)
- },
- {
- "neon_qp8_qs8_deptwiseconv2dnative",
- [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
- {
- return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8);
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)
- },
+static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = {
+ {"neon_qu8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)},
+ {"neon_qs8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)},
+ {"neon_fp16_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)},
+ {"neon_fp32_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)},
+ {"neon_qp8_qu8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)},
+ {"neon_qp8_qs8_deptwiseconv2dnative",
+ [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+ { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) >
+ src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) >
+ src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) ||
+ (info.pad_stride_info.stride().second < 1));
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
@@ -116,12 +98,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -131,9 +113,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -142,7 +125,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
} // namespace
-void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
@@ -151,18 +138,26 @@ void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITe
_conv_info = info;
const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation(
- DepthwiseConv2dNativeDataTypeISASelectorData{ weights->data_type(), src->data_type(), CPUInfo::get().get_isa() });
+ DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_func = uk->ukernel;
const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
+ auto_init_if_empty(*dst, src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(output_shape)
+ .set_quantization_info(dst->quantization_info()));
Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
}
-Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
return Status{};
@@ -187,7 +182,8 @@ const char *CpuDepthwiseConv2dNativeKernel::name() const
return "CpuDepthwiseConv2dNativeKernel";
}
-const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &CpuDepthwiseConv2dNativeKernel::get_available_kernels()
+const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &
+CpuDepthwiseConv2dNativeKernel::get_available_kernels()
{
return available_kernels;
}
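The available_kernels table above pairs each micro-kernel name with a selector predicate and a registered function pointer, and get_implementation() picks the first entry whose predicate accepts the runtime data. A simplified, dependency-free sketch of that first-match dispatch, using hypothetical types rather than the library's selector structs:

#include <cstdio>
#include <functional>
#include <string>
#include <vector>

enum class DataType { QASYMM8, QASYMM8_SIGNED, F16, F32 };

struct SelectorData { DataType weights_dt; bool fp16_supported; };

struct KernelEntry
{
    std::string                                 name;
    std::function<bool(const SelectorData &)>   is_selected;
    void (*ukernel)(const SelectorData &);
};

static void run_fp32(const SelectorData &) { std::puts("fp32 micro-kernel"); }
static void run_fp16(const SelectorData &) { std::puts("fp16 micro-kernel"); }

static const std::vector<KernelEntry> kernels = {
    {"fp16_kernel", [](const SelectorData &d) { return d.weights_dt == DataType::F16 && d.fp16_supported; }, run_fp16},
    {"fp32_kernel", [](const SelectorData &d) { return d.weights_dt == DataType::F32; }, run_fp32},
};

// First-match dispatch, analogous in spirit to get_implementation() above.
static const KernelEntry *select(const SelectorData &d)
{
    for (const auto &k : kernels)
        if (k.is_selected(d))
            return &k;
    return nullptr;
}

int main()
{
    const SelectorData d{DataType::F32, /*fp16_supported=*/false};
    if (const KernelEntry *k = select(d))
    {
        std::printf("selected: %s\n", k->name.c_str());
        k->ukernel(d);
    }
    return 0;
}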
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 9fabd0b01c..7e78f52e13 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
#include "support/AclRequires.h"
@@ -44,8 +45,9 @@ namespace kernels
class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel>
{
private:
- using DepthwiseConv2dNativeKernelPtr =
- std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::type;
+ using DepthwiseConv2dNativeKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::
+ type;
public:
CpuDepthwiseConv2dNativeKernel() = default;
@@ -64,17 +66,25 @@ public:
* @param[in] info Depthwise convolution meta-data.
*
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct DepthwiseConv2dNativeKernel
{
@@ -89,9 +99,9 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- DepthwiseConv2dNativeKernelPtr _func{ nullptr };
+ DepthwiseConv2dNativeKernelPtr _func{nullptr};
ConvolutionInfo _conv_info{};
- bool _has_biases{ false };
+ bool _has_biases{false};
};
} // namespace kernels
} // namespace cpu
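The validate_arguments() shown for this kernel rejects cases where the dilated kernel footprint, k + (k - 1) * (d - 1), exceeds the padded input extent, and the output shape then follows the usual convolution arithmetic. A small worked sketch of those two formulas, assuming floor rounding (the library's shape calculator also supports other rounding policies):

#include <cstdio>

// Dilated kernel extent along one axis.
static int dilated_extent(int kernel, int dilation)
{
    return kernel + (kernel - 1) * (dilation - 1);
}

// Output size along one axis: floor((in + pads - dilated_k) / stride) + 1.
static int conv_out_dim(int in, int kernel, int stride, int pad_before, int pad_after, int dilation)
{
    return (in + pad_before + pad_after - dilated_extent(kernel, dilation)) / stride + 1;
}

int main()
{
    // 3-wide kernel, dilation 2 -> effective footprint of 5.
    std::printf("dilated extent: %d\n", dilated_extent(3, 2));            // 5
    // 16-wide input, stride 1, pad 2+2 -> output stays 16 wide.
    std::printf("output width:   %d\n", conv_out_dim(16, 3, 1, 2, 2, 2)); // 16
    return 0;
}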
diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp
index a2d24f9243..d17128b5ac 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.cpp
+++ b/src/cpu/kernels/CpuDequantizeKernel.cpp
@@ -28,12 +28,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
@@ -48,9 +49,11 @@ namespace
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+ DataType::QSYMM16);
- if(dst->tensor_shape().total_size() > 0)
+ if (dst->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
@@ -124,28 +127,30 @@ void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Win
Iterator in(input, win_collapsed);
Iterator out(output, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale, offset);
+ const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
- store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale, offset);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
- }
- },
- in, out);
+ store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ auto val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -165,28 +170,30 @@ void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *o
Iterator in(input, win);
Iterator out(output, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale[id.z()]);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale[id.z()]);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
- }
- },
- in, out);
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -206,37 +213,34 @@ void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *o
Iterator in(input, win);
Iterator out(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t vscale =
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
- scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
- scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
- scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
- }
- };
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, vscale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
- }
- },
- in, out);
+ const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4],
+ scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9],
+ scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13],
+ scale[x + 14], scale[x + 15]}};
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, vscale);
+
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -257,28 +261,30 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind
Iterator in(input, win_collapsed);
Iterator out(output, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale);
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize(vin, scale);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
- }
- },
- in, out);
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int8_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
+ }
+ },
+ in, out);
}
template <typename T>
@@ -299,34 +305,36 @@ void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Win
Iterator in(input, win_collapsed);
Iterator out(output, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize_int16(vin, scale);
+ const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(in_ptr + x);
+ const auto vdeq = vdequantize_int16(vin, scale);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int16_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
- }
- },
- in, out);
+ store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int16_t val = *(in_ptr + x);
+ *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
+ }
+ },
+ in, out);
}
template <typename T>
void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
{
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
run_dequantization_qasymm8<T, uint8_t>(input, output, window);
@@ -335,7 +343,9 @@ void run_dequantization_core(const ITensor *input, ITensor *output, const Window
run_dequantization_qasymm8<T, int8_t>(input, output, window);
break;
case DataType::QSYMM8_PER_CHANNEL:
- input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
+ input->info()->data_layout() == DataLayout::NHWC
+ ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window)
+ : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
break;
case DataType::QSYMM8:
run_dequantization_qsymm8<T>(input, output, window);
@@ -377,7 +387,7 @@ void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, con
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- switch(dst->info()->data_type())
+ switch (dst->info()->data_type())
{
case DataType::F32:
run_dequantization_core<float>(src, dst, window);
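All of the loops above reduce to the affine dequantization real = scale * (q - offset); the per-channel variants index a scale per channel instead of using a single value. A scalar sketch with illustrative data, assuming an innermost-channel layout for the per-channel case:

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of the dequantization done by the kernels above.
static float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}

// Symmetric per-channel variant: one scale per channel, zero offset.
static std::vector<float> dequantize_per_channel(const std::vector<int8_t> &q,
                                                 const std::vector<float>  &scales,
                                                 size_t                     channels)
{
    std::vector<float> out(q.size());
    for (size_t i = 0; i < q.size(); ++i)
        out[i] = scales[i % channels] * q[i];
    return out;
}

int main()
{
    std::printf("%.3f\n", dequantize_qasymm8(200, 0.1f, 128)); // 7.200
    const auto v = dequantize_per_channel({10, -4, 20, 8}, {0.5f, 0.25f}, 2);
    for (float f : v)
        std::printf("%.2f ", f); // 5.00 -1.00 10.00 2.00
    std::printf("\n");
    return 0;
}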
diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h
index cfa991dc74..6ed58587c9 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.h
+++ b/src/cpu/kernels/CpuDequantizeKernel.h
@@ -54,7 +54,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
index a4cdddee5e..4cb0fb1c40 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
-#include "src/cpu/kernels/directconv2d/list.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/directconv2d/list.h"
using namespace arm_compute::detail;
@@ -38,26 +39,25 @@ namespace cpu
{
namespace kernels
{
-static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels =
-{
- {
- "neon_fp32_nhwc_directconv2d",
- [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
- REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)
- },
- {
- "neon_fp32_nchw_directconv2d",
- [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
- REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)
- },
- {
- "neon_fp16_nchw_directconv2d",
- [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)
- },
+static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = {
+ {"neon_fp32_nhwc_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)},
+ {"neon_fp32_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
+ REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)},
+ {"neon_fp16_nchw_directconv2d",
+ [](const DataTypeDataLayoutISASelectorData &data)
+ { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
@@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
ARM_COMPUTE_UNUSED(width_idx);
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
@@ -100,11 +100,15 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
// Configure window without any padding
win = calculate_max_window(*dst, Steps());
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
-void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
+void CpuDirectConv2dKernel::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -129,12 +133,13 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT
ICpuKernel::configure(win_config.second);
}
-Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status CpuDirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
- dst->clone().get())
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
return Status{};
}
@@ -149,7 +154,8 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() });
+ const auto *uk = CpuDirectConv2dKernel::get_implementation(
+ DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
uk->ukernel(window, src, weights, dst, _conv_info);
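Every reformatted compute loop in this patch shares the same structure: a main loop that advances a vector width of elements per iteration, followed by a scalar tail for the leftovers. A dependency-free sketch of that shape, with plain element-wise arithmetic standing in for the NEON intrinsics (names are illustrative):

#include <cstdio>
#include <vector>

// "Compute S elements per iteration" followed by "Compute left-over elements",
// as in the kernels above; step stands in for the 128-bit vector width.
static void saxpy_blocked(const float *x, float *y, int n, float a)
{
    constexpr int step = 4; // 4 x f32 per 128-bit register
    int           i    = 0;
    for (; i <= n - step; i += step)   // main, vector-sized loop
        for (int k = 0; k < step; ++k) // would be a single vector op in the kernel
            y[i + k] += a * x[i + k];
    for (; i < n; ++i)                 // scalar tail
        y[i] += a * x[i];
}

int main()
{
    std::vector<float> x(10, 1.f), y(10, 2.f);
    saxpy_blocked(x.data(), y.data(), static_cast<int>(x.size()), 0.5f);
    for (float v : y)
        std::printf("%.1f ", v); // 2.5 printed ten times
    std::printf("\n");
    return 0;
}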
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index b9265dc630..ad4caea193 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
{
private:
- using DirectConv2dKernel_Ptr = std::add_pointer<void(const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
+ using DirectConv2dKernel_Ptr = std::add_pointer<void(
+ const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
public:
CpuDirectConv2dKernel() = default;
@@ -64,10 +65,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct DirectConv2dKernel
@@ -81,8 +85,8 @@ public:
private:
PadStrideInfo _conv_info{};
- unsigned int _kernel_size{ 0 };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
+ unsigned int _kernel_size{0};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index 93ad5e5eba..d4af8bedaf 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -27,15 +27,16 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
#include <cstddef>
@@ -49,7 +50,9 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -57,22 +60,23 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(
+ src->data_layout(), DataLayoutDimension::CHANNEL)));
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
}
- if(src->data_type() == DataType::S32)
+ if (src->data_type() == DataType::S32)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
}
// Checks performed when output is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
- if(is_data_type_float(src->data_type()))
+ if (is_data_type_float(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -82,10 +86,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
- else if(src->data_type() == DataType::S32)
+ else if (src->data_type() == DataType::S32)
{
// In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
- ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) &&
+ (info.output_data_type != DataType::QASYMM8_SIGNED));
}
return Status{};
@@ -93,8 +98,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+output_stage_nchw(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
/** SIMD vector tag type. */
@@ -113,50 +123,57 @@ output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITens
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
- auto v_in = wrapper::vloadq(in_ptr);
-
- // Accumulate bias
- if(has_bias)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
- v_in = wrapper::vadd(v_in, vb);
- }
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
+ auto v_in = wrapper::vloadq(in_ptr);
- const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, v_in);
- }
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
+ v_in = wrapper::vadd(v_in, vb);
+ }
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, v_in);
+ }
- // Accumulate bias
- if(has_bias)
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
- *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
- }
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
- },
- in, out);
+ *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
+ }
+ },
+ in, out);
}
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+output_stage_nhwc(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -179,50 +196,59 @@ output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITens
Iterator bi(bias, window_bias);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr + x);
-
- // Accumulate bias
- if(has_bias)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
- }
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
+ auto v_in = wrapper::vloadq(in_ptr + x);
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr + x, v_in);
- }
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+ }
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ wrapper::vstore(out_ptr + x, v_in);
+ }
- // Accumulate bias
- if(has_bias)
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- *(out_ptr + x) = s_in;
- }
- },
- in, bi, out);
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ *(out_ptr + x) = s_in;
+ }
+ },
+ in, bi, out);
}
// Quantized case
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nchw(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
@@ -242,67 +268,63 @@ void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window,
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12)}};
+
+ // Accumulate bias
+ if (has_bias)
{
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12)
+ const auto vb = wrapper::vdup_n(
+ *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
+ v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb),
+ wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}};
}
- };
- // Accumulate bias
- if(has_bias)
- {
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
- v_in =
- {
- {
- wrapper::vadd(v_in.val[0], vb),
- wrapper::vadd(v_in.val[1], vb),
- wrapper::vadd(v_in.val[2], vb),
- wrapper::vadd(v_in.val[3], vb)
- }
- };
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
}
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
- min, max, false));
- }
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
- // Accumulate bias
- if(has_bias)
- {
- const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
}
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, out);
+ },
+ in, out);
}
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+template <
+ typename TOut,
+ typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nhwc(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift)
{
const bool has_bias = bias != nullptr;
using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
@@ -329,62 +351,65 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window,
Iterator bi(bias, window_bias);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in = {{
+ wrapper::vloadq(in_ptr),
+ wrapper::vloadq(in_ptr + 4),
+ wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12),
+ }};
+
+ // Accumulate bias
+ if (has_bias)
{
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12),
- }
- };
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+ wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+ wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+ wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+ }
- wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
- wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
- wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
- wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+ result_offset_after_shift_s32, min, max, false));
}
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
- }
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32_t s_in = *in_ptr;
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32_t s_in = *in_ptr;
+ // Accumulate bias
+ if (has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- s_in += *bias_ptr;
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr =
+ finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
}
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, bi, out);
+ },
+ in, bi, out);
}
} // namespace
-void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_UNUSED(bias);
@@ -398,7 +423,7 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
_result_offset_after_shift = info.result_offset_after_shift;
     // Auto-initialize output if required
- if(dst != nullptr)
+ if (dst != nullptr)
{
// Work out expected output data type
const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
@@ -410,16 +435,17 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
ICpuKernel::configure(win);
- const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
+ const bool is_qasymm8_signed =
+ (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
// Set appropriate function
- if(src->data_layout() == DataLayout::NCHW)
+ if (src->data_layout() == DataLayout::NCHW)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::S32:
{
- if(is_qasymm8_signed)
+ if (is_qasymm8_signed)
{
_func = &output_stage_nchw<int8_t>;
}
@@ -449,11 +475,11 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
}
else
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::S32:
{
- if(is_qasymm8_signed)
+ if (is_qasymm8_signed)
{
_func = &output_stage_nhwc<int8_t>;
}
@@ -483,7 +509,9 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor
}
}
-Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
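For reference, the quantized output stage reformatted above adds the bias to each S32 accumulator and then requantizes it with a fixed-point multiplier, a right shift and an output offset before clamping to the destination type (see the finalize_quantization calls in the hunks). The standalone sketch below is a simplified scalar model of that step, not the library's implementation; the function name, the non-negative-shift assumption and the exact rounding behaviour are mine.

#include <algorithm>
#include <cstdint>
#include <limits>

// Simplified model of the per-element output stage: accumulate the bias, apply a
// gemmlowp-style fixed-point multiply and rounding shift, add the output offset,
// then clamp to the destination range. The real kernel also handles negative
// shifts and saturation corner cases.
template <typename TOut>
TOut requantize_s32(int32_t acc, int32_t bias, int32_t multiplier, int shift, int32_t offset_after_shift)
{
    acc += bias; // "Accumulate bias" step from the kernel above

    // Fixed-point multiply: high 32 bits of the doubled 64-bit product, with a rounding nudge.
    const int64_t prod    = static_cast<int64_t>(acc) * multiplier;
    const int32_t fixedpt = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);

    // Rounding shift right, then re-centre on the destination zero point.
    int32_t result = (shift > 0) ? ((fixedpt + (1 << (shift - 1))) >> shift) : fixedpt;
    result += offset_after_shift;

    const int32_t lo = std::numeric_limits<TOut>::lowest();
    const int32_t hi = std::numeric_limits<TOut>::max();
    return static_cast<TOut>(std::min(hi, std::max(lo, result)));
}

// e.g. requantize_s32<int8_t>(acc, bias, result_fixedpoint_multiplier,
//                             result_shift, result_offset_after_shift);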
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index d3ef17b7c9..ce84f49cf6 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -55,29 +56,40 @@ public:
* Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
* @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
*/
- void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+ void
+ configure(ITensorInfo *src,
+ const ITensorInfo *bias = nullptr,
+ ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv2dOutputStageKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+ static Status
+ validate(const ITensorInfo *src,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *dst = nullptr,
+ const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
+ using OutputStageKernel = void(ITensor *src,
+ const ITensor *bias,
+ const Window &window,
+ ITensor *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift);
- OutputStageKernel *_func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
+ OutputStageKernel *_func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
index 22c60cd994..b5b2aed1ba 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/conv3d/neon/list.h"
#include <algorithm>
@@ -49,43 +50,37 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels =
-{
+static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = {
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)
- },
+ {"neon_fp16_directconv3d",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)},
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "neon_fp32_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)
- },
- {
- "neon_qasymm8_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)
- },
- {
- "neon_qasymm8_signed_directconv3d",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)
- }
-};
-
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
+ {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)},
+ {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)},
+ {"neon_qasymm8_signed_directconv3d",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}};
+
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U));
- const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -96,9 +91,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5);
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx));
- if(src2 != nullptr)
+ if (src2 != nullptr)
{
- if(is_data_type_quantized(src0->data_type()))
+ if (is_data_type_quantized(src0->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
}
@@ -106,14 +101,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+ "Biases size and number of dst feature maps should match");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
}
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
DataType data_type = src0->data_type();
@@ -125,12 +122,17 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
} // namespace
-void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info)
+void CpuDirectConv3dKernel::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_UNUSED(src2);
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
@@ -139,7 +141,8 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo
_name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name);
// Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
DataType data_type = src0->data_type();
@@ -154,7 +157,11 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo
ICpuKernel::configure(win);
}
-Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
+Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info));
@@ -188,4 +195,4 @@ const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> &CpuDirectConv3dKer
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
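The direct-convolution kernels above (and most other kernels touched by this patch) pick their micro-kernel from a static table: each entry pairs a name, a selector predicate over data type and ISA features, and a function pointer, and get_implementation() returns the first entry whose predicate matches. A reduced sketch of that pattern, using illustrative stand-in types and names rather than the library's:

#include <cstdio>
#include <vector>

// Illustrative stand-ins for the selector data and the micro-kernel signature.
struct SelectorData
{
    bool is_fp16;
    bool has_fp16_isa;
};
using MicroKernelFn = void (*)(const float *, float *, int);

struct MicroKernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    MicroKernelFn ukernel;
};

void run_fp32(const float *, float *, int) { std::puts("fp32 path"); }
void run_fp16(const float *, float *, int) { std::puts("fp16 path"); }

// Table ordered from most to least specialised, mirroring available_kernels above.
static const std::vector<MicroKernel> table = {
    {"fp16_kernel", [](const SelectorData &d) { return d.is_fp16 && d.has_fp16_isa; }, run_fp16},
    {"fp32_kernel", [](const SelectorData &d) { return !d.is_fp16; }, run_fp32},
};

// First-match lookup, equivalent in spirit to get_implementation().
const MicroKernel *pick(const SelectorData &d)
{
    for (const auto &k : table)
    {
        if (k.is_selected(d))
        {
            return &k;
        }
    }
    return nullptr;
}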
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h
index 688f368b9f..8e6f564679 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,7 +40,8 @@ class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel>
{
private:
/* Template function for convolution 3d NDHWC */
- using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
+ using DirectConv3dKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
public:
CpuDirectConv3dKernel() = default;
@@ -63,17 +65,25 @@ public:
     * @param[in] conv_info Contains padding, stride, activation information.
*
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info);
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv3dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct DirectConv3dKernel
@@ -87,7 +97,7 @@ public:
private:
Conv3dInfo _conv_info{};
- DirectConv3dKernelPtr _run_method{ nullptr };
+ DirectConv3dKernelPtr _run_method{nullptr};
std::string _name{};
};
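DirectConv3dKernelPtr above is declared with std::add_pointer over a function type; the snippet below shows, with illustrative argument types, that this is simply another spelling of a plain function-pointer alias.

#include <type_traits>

// Both aliases name the same type: pointer to a function taking
// (const int *, int *) and returning void.
using FnA = std::add_pointer<void(const int *, int *)>::type;
using FnB = void (*)(const int *, int *);
static_assert(std::is_same<FnA, FnB>::value, "equivalent spellings");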
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index a045855b1a..57a3f39822 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -24,8 +24,9 @@
#include "src/cpu/kernels/CpuElementwiseKernel.h"
#include "arm_compute/core/Helpers.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/elementwise_binary/list.h"
@@ -35,11 +36,11 @@
#if defined(ENABLE_FP32_KERNELS)
namespace
{
- static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
- static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
- static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
- static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
-}
+static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
+static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
+} // namespace
#endif /* ENABLE_FP32_KERNELS */
namespace arm_compute
@@ -50,255 +51,178 @@ namespace kernels
{
namespace
{
-template <ArithmeticOperation op>
-const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic =
-{
- {
- "sve2_qu8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)
- },
- {
- "sve2_qs8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)
- },
- {
- "sve_fp32_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)
- },
- {
- "sve_s32_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)
- },
- {
- "sve_s16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)
- },
- {
- "sve_fp16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)
- },
- {
- "neon_fp32_arithmetic",
-
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)
- },
- {
- "neon_s32_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)
- },
- {
- "neon_fp16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)
- },
- {
- "neon_s16_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)
- },
- {
- "neon_qu8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)
- },
- {
- "neon_qs8_arithmetic",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)
- },
+template <ArithmeticOperation op>
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = {
+ {"sve2_qu8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)},
+ {"sve2_qs8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)},
+ {"sve_fp32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)},
+ {"sve_s32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)},
+ {"sve_s16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)},
+ {"sve_fp16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ static_cast<ArithmeticOperation>(data.op) == op;
+ },
+ REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)},
+ {"neon_fp32_arithmetic",
+
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)},
+ {"neon_s32_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)},
+ {"neon_fp16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)},
+ {"neon_s16_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)},
+ {"neon_qu8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)},
+ {"neon_qs8_arithmetic",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)},
};
-template <ComparisonOperation op>
-const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison =
-{
- {
- "sve2_qu8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)
- },
- {
- "sve2_qs8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)
- },
- {
- "sve_u8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)
- },
- {
- "sve_fp32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)
- },
- {
- "sve_s16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)
- },
- {
- "sve_s32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)
- },
- {
- "sve_fp16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)
- },
- {
- "neon_u8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)
- },
- {
- "neon_fp32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)
- },
- {
- "neon_s16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)
- },
- {
- "neon_s32_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)
- },
- {
- "neon_qu8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)
- },
- {
- "neon_qs8_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)
- },
- {
- "neon_fp16_comparison",
- [](const ElementwiseDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op;
- },
- REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)
- },
+template <ComparisonOperation op>
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison = {
+ {"sve2_qu8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)},
+ {"sve2_qs8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)},
+ {"sve_u8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)},
+ {"sve_fp32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)},
+ {"sve_s16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)},
+ {"sve_s32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)},
+ {"sve_fp16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ static_cast<ComparisonOperation>(data.op) == op;
+ },
+ REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)},
+ {"neon_u8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)},
+ {"neon_fp32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)},
+ {"neon_s16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)},
+ {"neon_s32_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)},
+ {"neon_qu8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)},
+ {"neon_qs8_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)},
+ {"neon_fp16_comparison",
+ [](const ElementwiseDataTypeISASelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; },
+ REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)},
};
} // namespace
-const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &CpuArithmeticKernel::get_available_kernels()
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &
+CpuArithmeticKernel::get_available_kernels()
{
static std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels;
- std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(), available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(), available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(), available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(), available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(), available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(), available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(), available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(), available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(),
+ std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(),
+ available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
return available_kernels;
}
-const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &CpuComparisonKernel::get_available_kernels()
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &
+CpuComparisonKernel::get_available_kernels()
{
static std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels;
- std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(), available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(), available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(), available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(), available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(), available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
- std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(), available_kernels_comperison<ComparisonOperation::LessEqual>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(),
+ available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(),
+ available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(),
+ available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(),
+ available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(),
+ std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(),
+ available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
+ std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(),
+ available_kernels_comperison<ComparisonOperation::LessEqual>.end(),
+ std::back_inserter(available_kernels));
return available_kernels;
}
template <class Derived>
-Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
@@ -308,7 +232,7 @@ Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
"Wrong shape for output");
@@ -321,7 +245,8 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = CpuArithmeticKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) });
+ const auto *uk = CpuArithmeticKernel::get_implementation(
+ ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -329,7 +254,7 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso
_name = std::string("CpuArithmeticKernel").append("/").append(uk->name);
// If any of shapes is dynamic, expect a configured window and dst at run-time.
- if(src0->is_dynamic() || src1->is_dynamic())
+ if (src0->is_dynamic() || src1->is_dynamic())
{
return;
}
@@ -343,7 +268,8 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) });
+ const auto *uk = CpuComparisonKernel::get_implementation(
+ ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -351,7 +277,7 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso
_name = std::string("CpuComparisonKernel").append("/").append(uk->name);
// If any of shapes is dynamic, expect a configured window and dst at run-time.
- if(src0->is_dynamic() || src1->is_dynamic())
+ if (src0->is_dynamic() || src1->is_dynamic())
{
return;
}
@@ -373,8 +299,10 @@ void CpuElementwiseKernel<Derived>::run_op(ITensorPack &tensors, const Window &w
_run_method(src0, src1, dst, window);
}
-template void CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
-template void CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
template <class Derived>
const char *CpuElementwiseKernel<Derived>::name() const
@@ -385,7 +313,10 @@ template const char *CpuElementwiseKernel<CpuArithmeticKernel>::name() const;
template const char *CpuElementwiseKernel<CpuComparisonKernel>::name() const;
/** Arithmetic operators (min, max, squared_diff) */
-void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+void CpuArithmeticKernel::configure(ArithmeticOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
_op = op;
@@ -394,16 +325,20 @@ void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *s
Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S16, DataType::F16, DataType::S32, DataType::F32);
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
}
return validate_arguments_common(src0, src1, dst);
}
-Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status CpuArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -416,15 +351,15 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
- || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+ if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> ||
+ this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_min_max_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_min_max_mws_V1_fp32_neon;
}
@@ -434,7 +369,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -447,7 +382,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
@@ -467,14 +402,14 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count)
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+ if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_div_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_div_mws_V1_fp32_neon;
}
@@ -484,7 +419,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count)
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -497,7 +432,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count)
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
return ICPPKernel::default_mws;
@@ -538,7 +473,10 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1
}
/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+void CpuComparisonKernel::configure(ComparisonOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
_op = op;
@@ -547,16 +485,21 @@ void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *s
Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16,
+ DataType::S32, DataType::F32);
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
}
return validate_arguments_common(src0, src1, dst);
}
-Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status CpuComparisonKernel::validate(ComparisonOperation op,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
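validate_arguments_common() above rejects inputs whose shapes are not broadcast compatible and, when the destination is already configured, compares it against the broadcast shape. The sketch below is a minimal standalone model of that compatibility rule, not the library's TensorShape::broadcast_shape API:

#include <algorithm>
#include <cstddef>
#include <vector>

// Per-dimension broadcasting rule: sizes must match, or one of them must be 1.
// Returns the broadcast shape, or an empty vector when the inputs are
// incompatible (mirroring the "total_size() == 0" check above).
std::vector<size_t> broadcast_shape(const std::vector<size_t> &a, const std::vector<size_t> &b)
{
    const size_t        ndim = std::max(a.size(), b.size());
    std::vector<size_t> out(ndim, 1);
    for (size_t i = 0; i < ndim; ++i)
    {
        const size_t da = (i < a.size()) ? a[i] : 1;
        const size_t db = (i < b.size()) ? b[i] : 1;
        if (da != db && da != 1 && db != 1)
        {
            return {}; // not broadcast compatible
        }
        out[i] = std::max(da, db);
    }
    return out;
}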
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index 634e38bf9f..1f3e613b80 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -43,7 +43,8 @@ template <class Derived>
class CpuElementwiseKernel : public ICpuKernel<Derived>
{
private:
- using ElementwiseKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+ using ElementwiseKernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
public:
CpuElementwiseKernel() = default;
@@ -72,7 +73,7 @@ protected:
static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
protected:
- ElementwiseKernelPtr _run_method{ nullptr };
+ ElementwiseKernelPtr _run_method{nullptr};
std::string _name{};
};
@@ -96,7 +97,8 @@ public:
*
* @return a status
*/
- static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+ static Status
+ validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels();
@@ -200,7 +202,8 @@ public:
*
* @return a status
*/
- static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+ static Status
+ validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
static const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &get_available_kernels();
@@ -226,4 +229,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
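CpuElementwiseKernel is a CRTP base (it derives from ICpuKernel<Derived>) whose run_op() and name() bodies live in the .cpp file, which is why that file explicitly instantiates them for the arithmetic and comparison kernels ("template void CpuElementwiseKernel<...>::run_op(...)"). A toy version of that arrangement, with illustrative names:

#include <cstdio>

// Header-like part: CRTP base with an out-of-line member definition.
template <class Derived>
struct KernelBase
{
    void run();
};

struct ArithmeticKernel : KernelBase<ArithmeticKernel> {};
struct ComparisonKernel : KernelBase<ComparisonKernel> {};

// Source-like part: the definition plus explicit instantiations, so other
// translation units can link against run() without seeing its body.
template <class Derived>
void KernelBase<Derived>::run()
{
    std::puts("running");
}

template void KernelBase<ArithmeticKernel>::run();
template void KernelBase<ComparisonKernel>::run();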
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 04a7f15715..88545ee756 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/elementwise_unary/list.h"
@@ -59,12 +60,13 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale;
const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale;
- for(int i = 0; i < 256; ++i)
+ for (int i = 0; i < 256; ++i)
{
- const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
- float result = 0;
+ const auto in =
+ (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
+ float result = 0;
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
result = 1 / sqrt(in);
@@ -100,7 +102,8 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
result = utility::clamp(result, dst_min_fp, dst_max_fp);
- const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi);
+ const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi))
+ : quantize_qasymm8(result, dst_qi);
lut[i] = out;
}
@@ -109,97 +112,68 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
#endif // __aarch64__
-static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels =
-{
+static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = {
{
"sve_fp32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32 && data.isa.sve);
- },
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); },
REGISTER_FP32_SVE(sve_fp32_elementwise_unary),
nullptr,
},
{
"sve_fp16_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16);
- },
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); },
REGISTER_FP16_SVE(sve_fp16_elementwise_unary),
nullptr,
},
{
"sve_s32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::S32 && data.isa.sve);
- },
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); },
REGISTER_INTEGER_SVE(sve_s32_elementwise_unary),
nullptr,
},
{
"neon_fp32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
REGISTER_FP32_NEON(neon_fp32_elementwise_unary),
nullptr,
},
{
"neon_fp16_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.fp16;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
REGISTER_FP16_NEON(neon_fp16_elementwise_unary),
nullptr,
},
{
"neon_s32_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::S32;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; },
REGISTER_INTEGER_NEON(neon_s32_elementwise_unary),
nullptr,
},
#ifdef __aarch64__
{
"sve2_q8_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
- },
+ [](const DataTypeISASelectorData &data)
+ { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary),
&q8_prepare_lut,
},
{
"neon_q8_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED;
- },
+ [](const DataTypeISASelectorData &data)
+ { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary),
&q8_prepare_lut,
},
#else // __aarch64__
{
"neon_qasymm8_signed_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary),
nullptr,
},
{
"neon_qasymm8_elementwise_unary",
- [](const DataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8;
- },
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary),
nullptr,
},
@@ -211,7 +185,8 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai
void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
- const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuElementwiseUnaryKernel::get_implementation(
+ DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_op = op;
@@ -219,12 +194,12 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo
_name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name);
// If input shape is dynamic, expect a configured window and dst at run-time.
- if(src.is_dynamic())
+ if (src.is_dynamic())
{
return;
}
- if(uk->prepare_func != nullptr)
+ if (uk->prepare_func != nullptr)
{
_lut = uk->prepare_func(op, &src, &dst);
}
@@ -238,28 +213,31 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = CpuElementwiseUnaryKernel::get_implementation(
+ DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- switch(op)
+ switch (op)
{
case ElementWiseUnary::EXP:
case ElementWiseUnary::RSQRT:
case ElementWiseUnary::LOG:
case ElementWiseUnary::ROUND:
case ElementWiseUnary::SIN:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
break;
case ElementWiseUnary::NEG:
case ElementWiseUnary::ABS:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32,
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
break;
default:
ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
}
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
}
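q8_prepare_lut() above builds a 256-entry table by dequantising every possible 8-bit input, applying the unary operation in float, clamping to the representable output range, and re-quantising. Below is a condensed standalone sketch of the same idea for unsigned QASYMM8 and RSQRT only; the struct and the example quantisation parameters are made up for illustration.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

struct QuantInfo
{
    float   scale;
    int32_t offset; // zero point
};

// Build an input-value -> output-value table: dequantise, apply the op, clamp
// to what the output quantisation can represent, re-quantise.
// Assumes the source quantisation only produces values > 0 (RSQRT domain).
std::array<uint8_t, 256> make_rsqrt_lut(const QuantInfo &src_qi, const QuantInfo &dst_qi)
{
    const float dst_min = (0 - dst_qi.offset) * dst_qi.scale;
    const float dst_max = (255 - dst_qi.offset) * dst_qi.scale;

    std::array<uint8_t, 256> lut{};
    for (int i = 0; i < 256; ++i)
    {
        const float in     = (i - src_qi.offset) * src_qi.scale; // dequantise
        float       result = 1.0f / std::sqrt(in);               // ElementWiseUnary::RSQRT
        result             = std::min(std::max(result, dst_min), dst_max);

        const int q = static_cast<int>(std::lround(result / dst_qi.scale)) + dst_qi.offset;
        lut[i]      = static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }
    return lut;
}

// Example with made-up parameters: auto lut = make_rsqrt_lut({0.02f, 1}, {0.1f, 3});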
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
index 00188f0d49..249909854e 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -42,8 +43,10 @@ namespace kernels
class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel>
{
private:
- using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
- using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
+ using ElementwiseUnaryUkernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
+ using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(
+ ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
public:
CpuElementwiseUnaryKernel() = default;
@@ -65,7 +68,7 @@ public:
static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct ElementwiseUnaryKernel
@@ -80,7 +83,7 @@ public:
private:
ElementWiseUnary _op{};
- ElementwiseUnaryUkernelPtr _run_method{ nullptr };
+ ElementwiseUnaryUkernelPtr _run_method{nullptr};
std::string _name{};
std::unique_ptr<uint8_t[]> _lut{};
};
diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp
index f69de0082d..754da97ae1 100644
--- a/src/cpu/kernels/CpuFillKernel.cpp
+++ b/src/cpu/kernels/CpuFillKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -68,17 +69,18 @@ void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator tensor_it(inout, collapsed);
- execute_window_loop(collapsed, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + tensor_it.offset();
- // Set memory
- for(int i = 0; i < window_width; ++i)
+ execute_window_loop(
+ collapsed,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
- }
-
- },
- tensor_it);
+ uint8_t *base_addr = start_valid_region + tensor_it.offset();
+ // Set memory
+ for (int i = 0; i < window_width; ++i)
+ {
+ std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
+ }
+ },
+ tensor_it);
}
const char *CpuFillKernel::name() const
diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h
index ce41afc462..7c200c9b59 100644
--- a/src/cpu/kernels/CpuFillKernel.h
+++ b/src/cpu/kernels/CpuFillKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_FILL_KERNEL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -48,7 +49,7 @@ public:
void configure(const ITensorInfo *tensor, const PixelValue &constant_value);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp
index 65e390a81a..df7e6aad46 100644
--- a/src/cpu/kernels/CpuFloorKernel.cpp
+++ b/src/cpu/kernels/CpuFloorKernel.cpp
@@ -27,11 +27,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
#include "src/cpu/kernels/floor/list.h"
namespace arm_compute
@@ -42,29 +42,22 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuFloorKernel::FloorKernel> available_kernels =
-{
- {
- "neon_fp16_floor",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
- },
- {
- "neon_fp32_floor",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
- }
-};
+static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = {
+ {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)},
+ {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
// Validate in case of configured output
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -81,7 +74,8 @@ void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
- const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
@@ -122,17 +116,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src_it(src, win);
Iterator dst_it(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- _run_method(src_it.ptr(), dst_it.ptr(), len);
- },
- src_it, dst_it);
+ execute_window_loop(
+ win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it);
}
const char *CpuFloorKernel::name() const
diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h
index 35ab534ca8..57107d0532 100644
--- a/src/cpu/kernels/CpuFloorKernel.h
+++ b/src/cpu/kernels/CpuFloorKernel.h
@@ -65,7 +65,7 @@ public:
Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct FloorKernel
@@ -78,7 +78,7 @@ public:
static const std::vector<FloorKernel> &get_available_kernels();
private:
- FloorKernelPtr _run_method{ nullptr };
+ FloorKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
index 9fbf2d54c6..db433c99a8 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -60,7 +61,7 @@ Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITenso
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorShape dst_shape = compute_interleaved_shape(*src);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -111,35 +112,42 @@ void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &wind
Iterator in(src, win);
Iterator out(dst, win_out);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- if(id.y() + 4 <= static_cast<int>(in_height))
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- for(size_t x = window_start_x; x < window_end_x; ++x)
+ if (id.y() + 4 <= static_cast<int>(in_height))
{
- std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size);
- std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size);
- std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size);
- std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size);
- }
- }
- else
- {
- for(size_t x = window_start_x; x < window_end_x; ++x)
- {
- size_t y = 0;
- for(; y < partial_y; ++y)
+ for (size_t x = window_start_x; x < window_end_x; ++x)
{
- std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size);
+ std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size,
+ element_size);
+ std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size,
+ element_size);
}
- for(; y < 4; ++y)
+ }
+ else
+ {
+ for (size_t x = window_start_x; x < window_end_x; ++x)
{
- std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+ size_t y = 0;
+ for (; y < partial_y; ++y)
+ {
+ std::memcpy(out.ptr() + (x * 4 + y) * element_size,
+ (in.ptr() + y * in_stride) + x * element_size, element_size);
+ }
+ for (; y < 4; ++y)
+ {
+ std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+ }
}
}
- }
- },
- in, out);
+ },
+ in, out);
}
const char *CpuGemmInterleave4x4Kernel::name() const
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
index 4fb6a52a8b..2ce34bc4bc 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -71,7 +71,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
index f8bef64066..a3ed2cd171 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,646 +45,494 @@ namespace kernels
{
namespace
{
-void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
+void inline vector_matrix_multiply_u8(Iterator &ina,
+ Iterator &inb,
+ Iterator &out,
+ int width_a,
+ int width_b,
+ int width_out,
+ size_t stride_b,
+ const Window &window)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(id.x() > width_b)
- {
- return;
- }
-
- // Note: Since the input are all positives, we can use uint32_t
- // Accumulators for the block 0
- uint32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
+ if (id.x() > width_b)
{
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
+ return;
}
- };
-
- auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
- auto vec_a_end_addr = vec_a + width_a;
-
- // This for loop performs 8 accumulations
- for(; vec_a <= (vec_a_end_addr - 8);)
- {
- const uint8x8_t a00_u8 = vld1_u8(vec_a);
- const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
- const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
- const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
- const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
- const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
- const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
- const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
- const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4x2_t a00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(a00_u8)),
- vget_high_u16(vmovl_u8(a00_u8))
- }
- };
-
- const uint16x4x4_t b00_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- const uint16x4x4_t b10_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
- }
- };
-
- const uint16x4x4_t b20_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
- }
- };
- const uint16x4x4_t b30_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
- }
- };
+            // Note: Since the inputs are all positive, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for (; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+ const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+ const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+ const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+ const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+ const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+ const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+ const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}};
+
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ const uint16x4x4_t b10_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}};
+
+ const uint16x4x4_t b20_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}};
+
+ const uint16x4x4_t b30_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}};
+
+ const uint16x4x4_t b40_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}};
+
+ const uint16x4x4_t b50_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}};
+
+ const uint16x4x4_t b60_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}};
+
+ const uint16x4x4_t b70_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}};
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
- const uint16x4x4_t b40_u16 =
+ // This for loop performs the left-over accumulations
+ for (; vec_a < vec_a_end_addr;)
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
- }
- };
+ const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
- const uint16x4x4_t b50_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
- }
- };
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
- const uint16x4x4_t b60_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
- }
- };
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
- const uint16x4x4_t b70_u16 =
- {
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
- }
- };
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
-
- // Accumulate 1:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
-
- // Accumulate 2:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
-
- // Accumulate 3:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
-
- // Accumulate 4:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
-
- // Accumulate 5:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
-
- // Accumulate 6:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
-
- // Accumulate 7:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
-
- vec_a += 8;
- matrix_b += 8 * stride_b;
- }
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
- // This for loop performs the left-over accumulations
- for(; vec_a < vec_a_end_addr;)
- {
- const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
- const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
- const uint16x4x4_t b00_u16 =
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() < (width_out - 16))
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
- vec_a += 1;
- matrix_b += stride_b;
- }
-
- auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() < (width_out - 16))
- {
- vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
- vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
- vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
- vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
- }
- else
- {
- auto left_over = width_out - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
+ vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+ }
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ auto left_over = width_out - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(vec_out + k * 4 + j) = c0.val[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vec_out + k * 4 + j) = c0.val[k][j];
+ }
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
+void inline vector_matrix_multiply_s8(Iterator &ina,
+ Iterator &inb,
+ Iterator &out,
+ int width_a,
+ int width_b,
+ int width_out,
+ size_t stride_b,
+ const Window &window)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(id.x() > width_b)
- {
- return;
- }
-
- // Accumulators for the block 0
- int32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
+ if (id.x() > width_b)
{
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
+ return;
}
- };
-
- auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
- auto vec_a_end_addr = vec_a + width_a;
-
- // This for loop performs 8 accumulations
- for(; vec_a <= (vec_a_end_addr - 8);)
- {
- const int8x8_t a00_s8 = vld1_s8(vec_a);
- const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
- const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
- const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
- const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
- const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
- const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
- const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
- const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
-
- // Convert a00_s8 to int16_t and get the lower part
- const int16x4x2_t a00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(a00_s8)),
- vget_high_s16(vmovl_s8(a00_s8))
- }
- };
-
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- const int16x4x4_t b10_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
- }
- };
- const int16x4x4_t b20_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
- }
- };
+ // Accumulators for the block 0
+ int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for (; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const int8x8_t a00_s8 = vld1_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+ const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+ const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+ const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+ const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+ const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+ const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+ const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
+
+ // Convert a00_s8 to int16_t and get the lower part
+ const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}};
+
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ const int16x4x4_t b10_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}};
+
+ const int16x4x4_t b20_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}};
+
+ const int16x4x4_t b30_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}};
+
+ const int16x4x4_t b40_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}};
+
+ const int16x4x4_t b50_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}};
+
+ const int16x4x4_t b60_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}};
+
+ const int16x4x4_t b70_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}};
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
- const int16x4x4_t b30_s16 =
+ // This for loop performs the left-over accumulations
+ for (; vec_a < vec_a_end_addr;)
{
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
- }
- };
+ const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b);
- const int16x4x4_t b40_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
- }
- };
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
- const int16x4x4_t b50_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
- }
- };
+                // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
- const int16x4x4_t b60_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
- }
- };
-
- const int16x4x4_t b70_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
- }
- };
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
-
- // Accumulate 1:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
-
- // Accumulate 2:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
-
- // Accumulate 3:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
-
- // Accumulate 4:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
-
- // Accumulate 5:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
-
- // Accumulate 6:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
-
- // Accumulate 7:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
-
- vec_a += 8;
- matrix_b += 8 * stride_b;
- }
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
- // This for loop performs the left-over accumulations
- for(; vec_a < vec_a_end_addr;)
- {
- const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
- const int8x16_t b00_s8 = vld1q_s8(matrix_b);
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
- const int16x4x4_t b00_s16 =
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() < (width_out - 16))
{
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- // Convert a00_s8 to uint16_t and get the lower part
- const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
- // Accumulate 0:
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
- vec_a += 1;
- matrix_b += stride_b;
- }
-
- auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() < (width_out - 16))
- {
- vst1q_s32(vec_out + 0, c0.val[0]);
- vst1q_s32(vec_out + 4, c0.val[1]);
- vst1q_s32(vec_out + 8, c0.val[2]);
- vst1q_s32(vec_out + 12, c0.val[3]);
- }
- else
- {
- auto left_over = width_out - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
+ vst1q_s32(vec_out + 0, c0.val[0]);
+ vst1q_s32(vec_out + 4, c0.val[1]);
+ vst1q_s32(vec_out + 8, c0.val[2]);
+ vst1q_s32(vec_out + 12, c0.val[3]);
+ }
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ auto left_over = width_out - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(vec_out + k * 4 + j) = c0.val[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vec_out + k * 4 + j) = c0.val[k][j];
+ }
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+void inline matrix_multiply_u8(
+ Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
const auto width_out = static_cast<int>(out_info.dimension(0));
const auto height_out = static_cast<int>(out_info.dimension(1));
const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8_t *mtx_a0 = ina.ptr();
- const uint8_t *mtx_b0 = inb.ptr();
-
- // Note: Since the input are all positives, we can use uint32_t
- // Accumulators for the block 0
- uint32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
+ const uint8_t *mtx_a0 = ina.ptr();
+ const uint8_t *mtx_b0 = inb.ptr();
+
+            // Note: Since the inputs are all positive, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 1
+ uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 2
+ uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ // Accumulators for the block 3
+ uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+ for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
+ const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+                // Convert b00_u8 to uint16_t
+ const uint16x4x4_t b00_u16 = {
+ {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+ // 4x4 block 0
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+ // 4x4 block 1
+ c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
+ c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
+ c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
+ c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
+ c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
+ c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
+ c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
+
+ // 4x4 block 3
+ c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
+ c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
+ c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
+ c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
}
- };
- // Accumulators for the block 1
- uint32x4x4_t c1 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
+ auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
- // Accumulators for the block 2
- uint32x4x4_t c2 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- // Accumulators for the block 3
- uint32x4x4_t c3 =
- {
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
- };
-
- for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
- {
- const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
- const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
-
- // Convert a00_u8 to uint16_t and get the lower part
- const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
- // Convert b00_s8 to uint16_t
- const uint16x4x4_t b00_u16 =
+ if (id.y() < height_out && id.x() < (width_out - 16))
{
+ vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
+ if (id.y() + 1 < height_out)
{
- vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
- }
- };
-
- // 4x4 block 0
- c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
- c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
- c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
- c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
- // 4x4 block 1
- c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
- c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
- c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
- c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
-
- // 4x4 block 2
- c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
- c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
- c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
- c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
-
- // 4x4 block 3
- c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
- c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
- c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
- c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
- }
-
- auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
-
- if(id.y() < height_out && id.x() < (width_out - 16))
- {
- vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
- vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
- vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
- vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
- if(id.y() + 1 < height_out)
- {
- vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
- vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
- vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
- vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
- if(id.y() + 2 < height_out)
- {
- vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
- vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
- vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
- vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
- if(id.y() + 3 < height_out)
+ vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
+ vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
+ vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
+ vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
+ if (id.y() + 2 < height_out)
{
- vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
- vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
- vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
- vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
+ vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
+ vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
+ vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
+ vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
+ if (id.y() + 3 < height_out)
+ {
+ vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
+ vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
+ vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
+ vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
+ }
}
}
}
- }
- else
- {
- const auto left_over_value = width_out - id.x();
- auto left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ const auto left_over_value = width_out - id.x();
+ auto left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(mtx_out + k * 4 + j) = c0.val[k][j];
- }
- }
- if(id.y() + 1 < height_out)
- {
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+ *(mtx_out + k * 4 + j) = c0.val[k][j];
}
}
- if(id.y() + 2 < height_out)
+ if (id.y() + 1 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
}
}
- if(id.y() + 3 < height_out)
+ if (id.y() + 2 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ }
+ }
+ if (id.y() + 3 < height_out)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ }
}
}
}
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+void inline matrix_multiply_s8(
+ Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
{
const auto width_out = static_cast<int>(out_info.dimension(0));
const auto height_out = static_cast<int>(out_info.dimension(1));
@@ -691,182 +540,148 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
// All the values needed for computing a single 4x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
- auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
-
- // Note: Since the input are all positives, we can use uint32_t
- // Accumulators for the block 0
- int32x4x4_t c0 =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
+ auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+
+            // Note: The inputs are signed, so the accumulators are int32_t
+ // Accumulators for the block 0
+ int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 1
+ int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 2
+ int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ // Accumulators for the block 3
+ int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+ for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
+ const int8x8_t a00_s8 = vld1_s8(mtx_a0);
+ const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
+
+                // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+ // Convert b00_s8 to int16_t
+ const int16x4x4_t b00_s16 = {
+ {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+ // 4x4 block 0
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+ // 4x4 block 1
+ c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
+ c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
+ c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
+ c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
+ c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
+ c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
+ c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
+
+ // 4x4 block 3
+ c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
+ c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
+ c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
+ c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
}
- };
-
- // Accumulators for the block 1
- int32x4x4_t c1 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 2
- int32x4x4_t c2 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- // Accumulators for the block 3
- int32x4x4_t c3 =
- {
- {
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0),
- vdupq_n_s32(0)
- }
- };
-
- for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
- {
- const int8x8_t a00_s8 = vld1_s8(mtx_a0);
- const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
-
- // Convert a00_s8 to uint16_t and get the lower part
- const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
- // Convert b00_s8 to int16_t
- const int16x4x4_t b00_s16 =
- {
- {
- vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
- vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
- vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
- }
- };
-
- // 4x4 block 0
- c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
- c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
- c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
- c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
- // 4x4 block 1
- c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
- c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
- c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
- c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
-
- // 4x4 block 2
- c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
- c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
- c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
- c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
-
- // 4x4 block 3
- c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
- c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
- c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
- c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
- }
- auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
- if(id.y() < height_out && id.x() < (width_out - 16))
- {
- vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
- vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
- vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
- vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
- if(id.y() + 1 < height_out)
- {
- vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
- vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
- vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
- vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
- if(id.y() + 2 < height_out)
- {
- vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
- vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
- vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
- vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
- if(id.y() + 3 < height_out)
+ auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.y() < height_out && id.x() < (width_out - 16))
+ {
+ vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
+ vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
+ vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
+ vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
+ if (id.y() + 1 < height_out)
+ {
+ vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
+ vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
+ vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
+ vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
+ if (id.y() + 2 < height_out)
{
- vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
- vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
- vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
- vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
+ vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
+ vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
+ vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
+ vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
+ if (id.y() + 3 < height_out)
+ {
+ vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
+ vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
+ vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
+ vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
+ }
}
}
}
- }
- else if(id.y() < height_out)
- {
- const auto left_over_value = width_out - id.x();
- auto left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
- {
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
- {
- *(mtx_out + k * 4 + j) = c0.val[k][j];
- }
- }
- if(id.y() + 1 < height_out)
+ else if (id.y() < height_out)
{
- left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ const auto left_over_value = width_out - id.x();
+ auto left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+ *(mtx_out + k * 4 + j) = c0.val[k][j];
}
}
- if(id.y() + 2 < height_out)
+ if (id.y() + 1 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
}
}
- if(id.y() + 3 < height_out)
+ if (id.y() + 2 < height_out)
{
left_over = left_over_value;
- for(auto k = 0; k < 4 && left_over; ++k)
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
{
- *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+ }
+ }
+ if (id.y() + 3 < height_out)
+ {
+ left_over = left_over_value;
+ for (auto k = 0; k < 4 && left_over; ++k)
+ {
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+ }
}
}
}
}
}
- }
-
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
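
For reference, the NEON block above computes, for every step along K, a 4x16 outer-product accumulation: four int8 values of A (one per output row) are widened and multiplied against sixteen int8 values of B, with the products accumulated into the int32 tiles c0..c3. A minimal scalar sketch of that step (the helper name is illustrative, not library code):

#include <cstdint>

// Scalar equivalent of one 4x16 accumulation step of the NEON block above:
// c[row][col] += A[row] * B[col], with the int8 inputs widened before the
// multiply so the int32 accumulators cannot overflow on a single step.
inline void accumulate_4x16_block(const int8_t *a /* 4 values of A */,
                                  const int8_t *b /* 16 values of B */,
                                  int32_t c[4][16])
{
    for (int row = 0; row < 4; ++row)
    {
        const int32_t a_val = static_cast<int32_t>(a[row]);
        for (int col = 0; col < 16; ++col)
        {
            c[row][col] += a_val * static_cast<int32_t>(b[col]);
        }
    }
}
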
Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
TensorShape in0_shape = src0->tensor_shape();
@@ -874,9 +689,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
TensorShape out_shape = dst->tensor_shape();
// Check vector-by-matrix case
- if(out_shape[1] == 1)
+ if (out_shape[1] == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1],
+ "The number of input0's columns must be equal to input1's rows");
}
else
{
@@ -884,8 +700,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
in1_shape.collapse(2);
out_shape.collapse(2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2],
+ "Output tensor must have the same number of batches of input0 tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ in1_shape[2] != 1 && in0_shape[2] != in1_shape[2],
+ "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
}
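
Put differently, the shape rules enforced above reduce to the following checks once the shapes are collapsed to width/height/batches; this is a simplified restatement for illustration (array layout and function name are assumptions, not library code):

#include <cstddef>

// shape[0] = width (columns), shape[1] = height (rows), shape[2] = batches.
bool gemmlowp_shapes_ok(const size_t in0[3], const size_t in1[3], const size_t out[3])
{
    if (out[1] == 1)
    {
        // Vector-by-matrix case: input0's columns must match input1's rows.
        return in0[0] == in1[1];
    }
    // General case: batches must agree (or input1 must have a single batch)
    // and input1's width must be a multiple of 16.
    const bool batches_ok = (in0[2] == out[2]) && (in1[2] == 1 || in0[2] == in1[2]);
    const bool width_ok   = (in1[0] % 16) == 0;
    return batches_ok && width_ok;
}
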
@@ -909,20 +728,22 @@ void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const I
Window win;
    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- if((dst->dimension(1) == 1))
+ if ((dst->dimension(1) == 1))
{
// Configure kernel window
win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
}
else
{
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
}
ICpuKernel::configure(win);
}
-Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
return Status{};
@@ -939,12 +760,13 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
auto dst = tensors.get_tensor(TensorType::ACL_DST);
    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
- if((dst->info()->dimension(1) == 1))
+ if ((dst->info()->dimension(1) == 1))
{
const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
const auto width_out = static_cast<int>(dst->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
+ const auto in_b_stride =
+ static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
// The implementation computes 16 elements per iteration
const int window_start_x = 16 * info.thread_id;
@@ -963,7 +785,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(src1->info()->num_dimensions() >= 3)
+ if (src1->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -974,18 +796,20 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Iterator inb(src1, win_b);
Iterator out(dst, win_out);
- switch(src0->info()->data_type())
+ switch (src0->info()->data_type())
{
case DataType::S8:
case DataType::QASYMM8_SIGNED:
{
- vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
+ vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+ window);
break;
}
case DataType::U8:
case DataType::QASYMM8:
{
- vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
+ vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+ window);
break;
}
default:
@@ -1009,7 +833,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_slide_matrix_b)
+ if (_slide_matrix_b)
{
win_b = window;
}
@@ -1021,7 +845,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window
Iterator inb(src1, win_b);
Iterator out(dst, window);
- switch(src0->info()->data_type())
+ switch (src0->info()->data_type())
{
case DataType::S8:
case DataType::QASYMM8_SIGNED:
@@ -1050,4 +874,4 @@ const char *CpuGemmLowpMatrixMultiplyKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
index 2cc789d6d9..439ada1b47 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -68,11 +68,11 @@ public:
static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- bool _slide_matrix_b{ true };
+ bool _slide_matrix_b{true};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
index 534076b97c..9bd1eae663 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -38,37 +39,49 @@ namespace kernels
{
namespace
{
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(1),
+ "Output vector must have length equal to the number of rows of the input matrix");
}
return Status{};
}
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(0),
+ "Output vector must have length equal to the number of columns of the input matrix");
}
return Status{};
}
} // namespace
-void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -77,7 +90,7 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso
_scalar = info.scalar;
_mul_by_scalar = info.mul_by_scalar;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
_func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
@@ -98,14 +111,18 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso
ICpuKernel::configure(win);
}
-Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
return Status{};
}
template <typename T>
-void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
+void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src,
+ ITensor *dst,
+ const arm_compute::Window &window)
{
// Intermediate and final accumulator types
using TIAcc = wrapper::traits::promote_t<T>;
@@ -121,55 +138,58 @@ void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor
Iterator in(src, win_input);
Iterator out(dst, collapsed_window);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- TAcc sum_row = 0;
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
- const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
+ const T *matrix_a = reinterpret_cast<const T *>(
+ (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
#endif /* __arm__ */
- int i = 0;
- // This for loop performs 16 accumulations
- for(; i <= (_k - 16); i += 16)
- {
- const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+ int i = 0;
+ // This for loop performs 16 accumulations
+ for (; i <= (_k - 16); i += 16)
+ {
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
- // Partial accumulations in U16
- const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+ // Partial accumulations in U16
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
- // Accumulate to U32
- vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
- }
+ // Accumulate to U32
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+ }
- // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- sum_row += static_cast<TAcc>(matrix_a[i]);
- }
+ // This for loop performs the leftover accumulations
+ for (; i < _k; ++i)
+ {
+ sum_row += static_cast<TAcc>(matrix_a[i]);
+ }
#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- sum_row += wrapper::vaddv(vsum_row);
+ // Reduction operation available on 64 bit architectures only
+ sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
- auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
- tmp = wrapper::vpadd(tmp, tmp);
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
- sum_row += wrapper::vgetlane(tmp, 0);
+ sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_row *= _scalar;
- }
+ // Multiply by scalar if necessary
+ if (_mul_by_scalar)
+ {
+ sum_row *= _scalar;
+ }
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
- },
- in, out);
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
+ },
+ in, out);
}
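
The loop above is the vectorised form of a plain per-row sum: for every row of matrix A it accumulates the _k 8-bit elements into an int32 and, if requested, scales the result. A scalar sketch of that reduction (the function below is illustrative only):

#include <cstdint>

int32_t reduce_row(const int8_t *row, int k, bool mul_by_scalar, int32_t scalar)
{
    int32_t sum_row = 0;
    for (int i = 0; i < k; ++i)
    {
        sum_row += static_cast<int32_t>(row[i]);
    }
    // Multiply by scalar if necessary, mirroring the _mul_by_scalar branch above.
    return mul_by_scalar ? sum_row * scalar : sum_row;
}
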
void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -189,7 +209,9 @@ const char *CpuGemmLowpMatrixAReductionKernel::name() const
return "CpuGemmLowpMatrixAReductionKernel";
}
-void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));
@@ -201,7 +223,7 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 16;
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
_func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
@@ -223,14 +245,19 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso
ICpuKernel::configure(win);
}
-Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
return Status{};
}
template <typename T>
-void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info)
{
// Intermediate and final accumulator types
using TIAcc = wrapper::traits::promote_t<T>;
@@ -258,121 +285,116 @@ void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor
Iterator inb(src, win_in);
Iterator out(dst, win_out);
- execute_window_loop(win_out, [&](const Coordinates & id)
- {
- if(id.x() > width_matrix_b)
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &id)
{
- return;
- }
+ if (id.x() > width_matrix_b)
+ {
+ return;
+ }
- // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
- {
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
- };
+ // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = {
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})};
- const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
- asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+ asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
#endif /* __arm__ */
- int i = 0;
- // This for loop performs 4 accumulations
- for(; i <= (_k - 4); i += 4)
- {
- const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
- const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
- const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for (; i <= (_k - 4); i += 4)
+ {
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
- asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+ asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */
- // Partial accumulation in 16bit
- typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
- {
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
- wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
- };
-
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
- tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
- tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
-
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
-
- matrix_b += 4 * in_b_stride;
- }
-
-        // This for loop performs the leftover accumulations
- for(; i < _k; ++i)
- {
- const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ // Partial accumulation in 16bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = {
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})};
+
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+
+ matrix_b += 4 * in_b_stride;
+ }
- // Convert S8 to S16
- const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
+            // This for loop performs the leftover accumulations
+ for (; i < _k; ++i)
{
- wrapper::vmovl(wrapper::vgetlow(b0_b8)),
- wrapper::vmovl(wrapper::vgethigh(b0_b8))
- };
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
- // Accumulate to 32bit
- sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
- sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
- sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
- sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
+ // Convert S8 to S16
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]{
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))};
- matrix_b += in_b_stride;
- }
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
- // Multiply by scalar if necessary
- if(_mul_by_scalar)
- {
- sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
- sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
- sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
- sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
- }
-
- auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- if(id.x() + 16 < width_matrix_b)
- {
- wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
- wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
- wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
- wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
- }
- else
- {
- auto left_over = width_matrix_b - id.x();
- for(auto k = 0; k < 4 && left_over; ++k)
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by scalar if necessary
+ if (_mul_by_scalar)
+ {
+ sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+ sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+ sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+ sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+ }
+
+ auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+ if (id.x() + 16 < width_matrix_b)
+ {
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
+ }
+ else
{
- for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+ auto left_over = width_matrix_b - id.x();
+ for (auto k = 0; k < 4 && left_over; ++k)
{
- *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+ for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+ {
+ *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+ }
}
}
- }
- },
- inb, out);
+ },
+ inb, out);
}
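
Likewise, the B reduction above is a per-column sum taken 16 columns at a time; in scalar form it amounts to the sketch below (illustrative only, with a row-major stride expressed in elements):

#include <cstdint>
#include <vector>

std::vector<int32_t> reduce_columns(
    const int8_t *b, int k, int width, int stride, bool mul_by_scalar, int32_t scalar)
{
    std::vector<int32_t> sum_col(width, 0);
    for (int row = 0; row < k; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            sum_col[col] += static_cast<int32_t>(b[row * stride + col]);
        }
    }
    if (mul_by_scalar)
    {
        for (auto &v : sum_col)
        {
            v *= scalar;
        }
    }
    return sum_col;
}
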
void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -393,4 +415,4 @@ const char *CpuGemmLowpMatrixBReductionKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
index e469629cdb..20ef17e96d 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -66,7 +66,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -85,12 +85,14 @@ private:
* @param[out] dst Output tensor
* @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+ using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window);
- CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr };
- int32_t _k{ 0 };
- int32_t _scalar{ 0 };
- bool _mul_by_scalar{ false };
+ CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr};
+ int32_t _k{0};
+ int32_t _scalar{0};
+ bool _mul_by_scalar{false};
};
/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
@@ -124,7 +126,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -144,12 +146,15 @@ private:
* @param[out] dst Output tensor
* @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info);
+ using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info);
- CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr };
- int32_t _k{ 0 };
- int32_t _scalar{ 0 };
- bool _mul_by_scalar{ false };
+ CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr};
+ int32_t _k{0};
+ int32_t _scalar{0};
+ bool _mul_by_scalar{false};
};
} // namespace kernels
} // namespace cpu
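
The _func members declared above are member-function pointers: configure() selects the typed run_internal instantiation based on the data type, and run_op() only invokes it. A standalone sketch of the pattern (class and names are hypothetical):

#include <cstdint>
#include <iostream>

class ReductionSketch
{
public:
    void configure(bool is_signed)
    {
        // Pick the typed implementation once, based on the data type.
        _func = is_signed ? &ReductionSketch::run_internal<int8_t>
                          : &ReductionSketch::run_internal<uint8_t>;
    }

    void run()
    {
        if (_func != nullptr)
        {
            (this->*_func)();
        }
    }

private:
    template <typename T>
    void run_internal()
    {
        std::cout << "running with element size " << sizeof(T) << "\n";
    }

    using FuncPtr = void (ReductionSketch::*)();
    FuncPtr _func{nullptr};
};
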
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
index a65f1a33de..e290783021 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,32 +45,37 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
+ if (b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
+ if (output_shape.num_dimensions() > 1)
{
const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -80,13 +86,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
- if(a_offset != 0)
+ if (a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
@@ -94,9 +102,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
return Status{};
}
-void run_offset_contribution(const Window &window,
- ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
+void run_offset_contribution(const Window &window,
+ ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool slide_vector_sum_col,
+ bool is_gemm3d)
{
Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -112,7 +126,7 @@ void run_offset_contribution(const Window &window,
const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0;
Iterator mm_result_it(mm_result, collapsed_window);
- if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+ if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
{
// Set window for vector_sum_col
Window win_vector_sum_col(collapsed_window);
@@ -131,95 +145,85 @@ void run_offset_contribution(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
// Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const size_t batch_offset_col = batch_id * (sum_col_stride_y );
- auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset);
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
- // Compute the leftover term due to b_offset.
- int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
- b_offset_term_s32 *= b_offset;
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
- const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
{
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(vector_sum_col_ptr + x + 0),
- vld1q_s32(vector_sum_col_ptr + x + 4),
- vld1q_s32(vector_sum_col_ptr + x + 8),
- vld1q_s32(vector_sum_col_ptr + x + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- // Add a_offset_term_s32 and b_offset_term_s32
- int32x4x4_t offset_term_s32 =
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col = batch_id * (sum_col_stride_y);
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
+
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
-
- offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
- offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
- offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
- offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
-
- int32x4x4_t in_s32 =
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ // Add a_offset_term_s32 and b_offset_term_s32
+ int32x4x4_t offset_term_s32 = {
+ {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
+
+ offset_term_s32.val[0] =
+ vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+ offset_term_s32.val[1] =
+ vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+ offset_term_s32.val[2] =
+ vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+ offset_term_s32.val[3] =
+ vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
+ // Compute the leftover term due to a_offset.
+ int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Compute the leftover term due to a_offset.
- int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
- a_offset_term_s32 *= a_offset;
+ a_offset_term_s32 *= a_offset;
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
- }
- },
- vector_sum_col_it, vector_sum_row_it, mm_result_it);
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
+ }
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it);
}
- else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+ else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -233,54 +237,51 @@ void run_offset_contribution(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
- // Compute the leftover term due to b_offset.
- int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
- b_offset_term_s32 *= b_offset;
+ // Compute the leftover term due to b_offset.
+ int32_t b_offset_term_s32 =
+ *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
- const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- int32x4x4_t in_s32 =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += b_offset_term_s32;
- }
- },
- vector_sum_row_it, mm_result_it);
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
+ {
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += b_offset_term_s32;
+ }
+ },
+ vector_sum_row_it, mm_result_it);
}
- else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+ else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
{
// Set window for vector_sum_col
Window win_vector_sum_col(collapsed_window);
@@ -290,69 +291,62 @@ void run_offset_contribution(const Window &window,
Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
// Offset in case vector_sum_col is batched
- const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
-
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const size_t batch_offset_col = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
- auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset);
- auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+ const int vector_sum_col_batch_offset =
+ slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
{
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
+ const int batch_id = id.z() / depth_input;
+ const size_t batch_offset_col =
+ batch_id *
+ (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+ batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(vector_sum_col_ptr + x + 0),
- vld1q_s32(vector_sum_col_ptr + x + 4),
- vld1q_s32(vector_sum_col_ptr + x + 8),
- vld1q_s32(vector_sum_col_ptr + x + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- int32x4x4_t in_s32 =
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 = {
+ {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(mm_result_ptr + x + 0),
- vld1q_s32(mm_result_ptr + x + 4),
- vld1q_s32(mm_result_ptr + x + 8),
- vld1q_s32(mm_result_ptr + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
- vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
- vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
- vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Compute the leftover term due to a_offset.
- const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
- // Add the offset terms to GEMM's result
- // Store the result with the offset contribution
- mm_result_ptr[x] += a_offset_term_s32 * a_offset;
- }
- },
- vector_sum_col_it, mm_result_it);
+ // Compute the leftover term due to a_offset.
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += a_offset_term_s32 * a_offset;
+ }
+ },
+ vector_sum_col_it, mm_result_it);
}
else // false, false
{
@@ -362,7 +356,12 @@ void run_offset_contribution(const Window &window,
}
} // namespace
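
In scalar terms, run_offset_contribution adds the quantization-offset correction to every int32 GEMM accumulator: a term driven by the per-column sums of B (scaled by a_offset), a term driven by the per-row sums of A (scaled by b_offset), and the constant k_offset = a_offset * b_offset * k. The branches above merely specialise the cases where one of the offsets is zero. A minimal sketch of the dense case (function and parameter names are illustrative):

#include <cstdint>

void offset_contribution(int32_t       *mm_result,
                         const int32_t *sum_col, // per-column sums of B
                         const int32_t *sum_row, // per-row sums of A
                         int            height,
                         int            width,
                         int32_t        a_offset,
                         int32_t        b_offset,
                         int32_t        k)
{
    const int32_t k_offset = a_offset * b_offset * k;
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            // Same arithmetic as the left-overs loop above, applied to the whole tile.
            mm_result[y * width + x] += k_offset + a_offset * sum_col[x] + b_offset * sum_row[y];
        }
    }
}
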
-void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result,
+ ITensorInfo *vector_sum_col,
+ ITensorInfo *vector_sum_row,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset)
{
// Perform validate step
ARM_COMPUTE_UNUSED(vector_sum_row);
@@ -374,7 +373,7 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen
_k_offset = a_offset * b_offset * k;
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
        // Check if vector_sum_col_shape should be slid or not
// Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
@@ -387,8 +386,11 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen
ICpuKernel::configure(win);
}
-Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
+Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
return Status{};
@@ -405,11 +407,11 @@ void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Win
auto mm_result = tensors.get_tensor(TensorType::ACL_DST);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+ mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
- run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
+ run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset,
+ _slide_vector_sum_col, reinterpret_as_3d);
}
const char *CpuGemmLowpOffsetContributionKernel::name() const
@@ -418,4 +420,4 @@ const char *CpuGemmLowpOffsetContributionKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
index 3514ca811d..08b2d47529 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -63,24 +63,33 @@ public:
* @param[in] a_offset Offset to be added to each element of the matrix A.
* @param[in] b_offset Offset to be added to each element of the matrix B.
*/
- void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+ void configure(ITensorInfo *mm_result,
+ ITensorInfo *vector_sum_col,
+ ITensorInfo *vector_sum_row,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpOffsetContributionKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ int32_t a_offset,
+ int32_t b_offset);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
- int32_t _a_offset{ 0 };
- int32_t _b_offset{ 0 };
- int32_t _k_offset{ 0 };
- bool _slide_vector_sum_col{ true };
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ int32_t _k_offset{0};
+ bool _slide_vector_sum_col{true};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
index 190487eced..d008842398 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -31,10 +31,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -48,80 +49,38 @@ namespace
{
inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
{
- return
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)
- }
- };
+ return {{vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)}};
}
inline int32x4x4_t load(const int32_t *ptr, int32_t x)
{
- return
- {
- {
- vld1q_s32(ptr + x + 0),
- vld1q_s32(ptr + x + 4),
- vld1q_s32(ptr + x + 8),
- vld1q_s32(ptr + x + 12)
- }
- };
+ return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}};
}
inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
{
- return
- {
- {
- vaddq_s32(a.val[0], b),
- vaddq_s32(a.val[1], b),
- vaddq_s32(a.val[2], b),
- vaddq_s32(a.val[3], b)
- }
- };
+ return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}};
}
inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
{
- return
- {
- {
- vaddq_s32(a.val[0], b.val[0]),
- vaddq_s32(a.val[1], b.val[1]),
- vaddq_s32(a.val[2], b.val[2]),
- vaddq_s32(a.val[3], b.val[3])
- }
- };
+ return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])}};
}
inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
{
- return
- {
- {
- vmulq_n_s32(a.val[0], mul_scalar),
- vmulq_n_s32(a.val[1], mul_scalar),
- vmulq_n_s32(a.val[2], mul_scalar),
- vmulq_n_s32(a.val[3], mul_scalar)
- }
- };
+ return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)}};
}
inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier)
{
- return
- {
- {
- vmulq_s32(a.val[0], vld1q_s32(multilpier)),
- vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)),
- vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)),
- vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))
- }
- };
+ return {{vmulq_s32(a.val[0], vld1q_s32(multilpier)), vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)),
+ vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))}};
}
inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
@@ -144,18 +103,11 @@ inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offse
inline int32x4x4_t get_k_offset(int32_t k_offset)
{
- return
- {
- {
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
+ return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
}
-inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
+inline uint8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -172,18 +124,13 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to U8
uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_u8 = vmaxq_u8(out_u8, min_u8);
out_u8 = vminq_u8(out_u8, max_u8);
@@ -192,7 +139,8 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
return out_u8;
}
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+inline int8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -209,18 +157,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -229,7 +172,8 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
return out_s8;
}
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+inline int8x16_t finalize_quantization_floating_point(
+ int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
{
const static int32x4_t zero_s32 = vdupq_n_s32(0);
@@ -246,18 +190,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32
in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8
int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
out_s8 = vmaxq_s8(out_s8, min_s8);
out_s8 = vminq_s8(out_s8, max_s8);
@@ -305,81 +244,103 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias)
}
template <typename VT>
-inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
- const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
- typename VT::vtype min_vec, typename VT::vtype max_vec,
- int32_t a_offset, int32_t b_offset, int32_t k_offset,
- int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
+inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr,
+ const int32_t *vector_sum_row_ptr,
+ const int32_t *bias_ptr,
+ Iterator mm_result_it,
+ Iterator out_it,
+ const int32x4_t result_offset_s32,
+ const int32x4_t result_shift_s32,
+ typename VT::vtype min_vec,
+ typename VT::vtype max_vec,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ int32_t multiplier,
+ int32_t shift,
+ int32_t offset,
+ int32_t min_bound,
+ int32_t max_bound,
+ int window_step_x,
+ int window_start_x,
+ int window_end_x,
+ bool has_a_offset,
+ bool has_b_offset,
+ bool has_bias,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
- int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
- if(!is_fixed_point)
+ int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+ if (!is_fixed_point)
{
// Combine quantization offset with other offsets.
offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
}
- if(has_a_offset && has_b_offset)
+ if (has_a_offset && has_b_offset)
{
offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
}
- if(has_b_offset)
+ if (has_b_offset)
{
offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
}
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
- if(has_a_offset)
+ if (has_a_offset)
{
in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
}
- if(has_bias)
+ if (has_bias)
{
in_s32 = add_s32(in_s32, load(bias_ptr, x));
}
- if(!is_fixed_point || has_b_offset)
+ if (!is_fixed_point || has_b_offset)
{
in_s32 = add_s32(in_s32, offset_term_s32);
}
- if(!is_fixed_point)
+ if (!is_fixed_point)
{
in_s32 = mul_s32(in_s32, multiplier);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
- wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
+ wrapper::vstore(
+ reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+ finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
}
else
{
- wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
- finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
+ wrapper::vstore(
+ reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+ finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
}
}
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+ int32_t in_value =
+ *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
- if(has_a_offset)
+ if (has_a_offset)
{
in_value += (*(vector_sum_col_ptr + x) * a_offset);
}
- if(has_bias)
+ if (has_bias)
{
in_value += *(bias_ptr + x);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
// Finalize and store the result
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset,
- static_cast<typename VT::stype>(min_bound),
- static_cast<typename VT::stype>(max_bound), is_bounded_relu);
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+ finalize_quantization(in_value, multiplier, shift, offset, static_cast<typename VT::stype>(min_bound),
+ static_cast<typename VT::stype>(max_bound), is_bounded_relu);
}
else
{
@@ -387,75 +348,100 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
in_value = (in_value * multiplier) >> shift;
// Bound and store the result
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
- in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ in_value = static_cast<typename VT::stype>(
+ std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
- *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
- std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
+ *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+ static_cast<typename VT::stype>(std::max<int32_t>(
+ static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
+ std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
}
}
}
-inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
- const int32_t *result_multipliers, const int32_t *result_shifts,
- const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
- int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
- int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
+inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr,
+ const int32_t *bias_ptr,
+ Iterator mm_result_it,
+ Iterator out_it,
+ const int32_t *result_multipliers,
+ const int32_t *result_shifts,
+ const int32x4_t result_offset,
+ int8x16_t min_s8,
+ int8x16_t max_s8,
+ int32_t a_offset,
+ int32_t offset,
+ int32_t min_bound,
+ int32_t max_bound,
+ int window_step_x,
+ int window_start_x,
+ int window_end_x,
+ bool has_a_offset,
+ bool has_bias,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
- int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
- if(!is_fixed_point)
+ int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+ if (!is_fixed_point)
{
// Combine quantization offset with other offsets.
offset_term_s32 = add_s32(offset_term_s32, result_offset);
}
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
- if(has_a_offset)
+ if (has_a_offset)
{
in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
}
- if(has_bias)
+ if (has_bias)
{
in_s32 = add_s32(in_s32, load(bias_ptr, x));
}
- if(!is_fixed_point)
+ if (!is_fixed_point)
{
in_s32 = add_s32(in_s32, offset_term_s32);
in_s32 = mul_s32(in_s32, result_multipliers + x);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu));
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x),
+ finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x),
+ result_offset, min_s8, max_s8, is_bounded_relu));
}
else
{
- vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
+ vst1q_s8(
+ reinterpret_cast<int8_t *>(out_it.ptr() + x),
+ finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
}
}
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+ int32_t in_value =
+ *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
- if(has_a_offset)
+ if (has_a_offset)
{
in_value += (*(vector_sum_col_ptr + x) * a_offset);
}
- if(has_bias)
+ if (has_bias)
{
in_value += *(bias_ptr + x);
}
- if(is_fixed_point)
+ if (is_fixed_point)
{
// Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
+ *(out_it.ptr() + x) =
+ finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset,
+ static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
}
else
{
@@ -463,7 +449,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
// Bound and store the result
- if(is_bounded_relu)
+ if (is_bounded_relu)
{
in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
@@ -473,10 +459,20 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect
}
template <typename T>
-void run_offset_contribution_output_stage(const Window &window,
- const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched,
- GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
+void run_offset_contribution_output_stage(const Window &window,
+ const ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ const ITensor *bias,
+ ITensor *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool is_vector_sum_col_batched,
+ GEMMLowpOutputStageInfo output_stage,
+ bool is_gemm3d,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
// Semantics of XYZW Explained for each tensor
//
@@ -516,7 +512,7 @@ void run_offset_contribution_output_stage(const Window &window,
Iterator mm_result_it(mm_result, win);
Iterator out_it(output, win);
- if((a_offset != 0) && (b_offset != 0))
+ if ((a_offset != 0) && (b_offset != 0))
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -527,45 +523,52 @@ void run_offset_contribution_output_stage(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
// Offset in case vector_sum_col is batched in y dimension
- const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
- mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+ mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset,
+ k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x,
+ window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32,
+ result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false,
+ is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
}
}
- else if((a_offset == 0) && (b_offset != 0))
+ else if ((a_offset == 0) && (b_offset != 0))
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -573,114 +576,139 @@ void run_offset_contribution_output_stage(const Window &window,
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_row_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+ false, true, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_row_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
- + id.y() + (id.z() % depth_input) * height_input;
- run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_row_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_row_ptr =
+ reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+ id.y() + (id.z() % depth_input) * height_input;
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_row_it, mm_result_it, out_it);
}
}
- else if((a_offset != 0) && (b_offset == 0))
+ else if ((a_offset != 0) && (b_offset == 0))
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
// Offset in case vector_sum_col is batched in y dimension
- const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
- out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+ out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+ multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+ true, false, true, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window<Typer>(
+ vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+ min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
}
}
else
{
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point);
- },
- bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier,
+ shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false,
+ true, is_bounded_relu, is_fixed_point);
+ },
+ bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it,
- result_offset_s32, result_shift_s32,
- min_vec, max_vec, a_offset, b_offset, k_offset,
- multiplier, shift, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point);
- },
- mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window<Typer>(
+ nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec,
+ max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu,
+ is_fixed_point);
+ },
+ mm_result_it, out_it);
}
return;
}
}
-void run_offset_contribution_output_stage_symm(const Window &window,
- const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
- int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched,
- GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
+void run_offset_contribution_output_stage_symm(const Window &window,
+ const ITensor *mm_result,
+ const ITensor *vector_sum_col,
+ const ITensor *vector_sum_row,
+ const ITensor *bias,
+ ITensor *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t k_offset,
+ bool is_vector_sum_col_batched,
+ GEMMLowpOutputStageInfo output_stage,
+ bool is_gemm3d,
+ bool is_bounded_relu,
+ bool is_fixed_point)
{
ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
@@ -690,8 +718,8 @@ void run_offset_contribution_output_stage_symm(const Window &window,
const int32_t min_bound = output_stage.gemmlowp_min_bound;
const int32_t max_bound = output_stage.gemmlowp_max_bound;
- const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
- const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
+ const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
+ const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
@@ -708,88 +736,105 @@ void run_offset_contribution_output_stage_symm(const Window &window,
Iterator mm_result_it(mm_result, win);
Iterator out_it(output, win);
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
// Offset in case vector_sum_col is batched in y dimension
- const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+ const int vector_sum_col_stride_batch =
+ is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window_symm(
+ vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu,
+ is_fixed_point);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
- run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
- },
- vector_sum_col_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+ vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+ run_offset_contribution_output_stage_window_symm(
+ vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x,
+ window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
}
}
else
{
- if(bias != nullptr)
+ if (bias != nullptr)
{
Iterator bias_it = get_bias_it(collapsed_window, bias);
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point);
- },
- bias_it, mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm(
+ nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+ min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu,
+ is_fixed_point);
+ },
+ bias_it, mm_result_it, out_it);
}
else
{
- execute_window_loop(collapsed_window, [&](const Coordinates &)
- {
- run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it,
- result_multipliers, result_shifts,
- result_offset_s32, min_s8, max_s8,
- a_offset, offset, min_bound, max_bound,
- window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point);
- },
- mm_result_it, out_it);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm(
+ nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32,
+ min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x,
+ window_end_x, false, false, is_bounded_relu, is_fixed_point);
+ },
+ mm_result_it, out_it);
}
return;
}
}
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- if(output->data_type() != DataType::QASYMM8)
+ if (output->data_type() != DataType::QASYMM8)
{
- ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 &&
+ b_offset != 0);
}
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN &&
+ output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -797,7 +842,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
@@ -805,19 +850,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
+ if (b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = output->tensor_shape();
- if(output_shape.num_dimensions() > 1)
+ if (output_shape.num_dimensions() > 1)
{
const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -828,13 +875,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
- if(a_offset != 0)
+ if (a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
@@ -842,7 +891,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3);
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
@@ -852,15 +901,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
} // namespace
-void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
- const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
- int32_t k, int32_t a_offset, int32_t b_offset,
+void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_UNUSED(vector_sum_row, bias);
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
_a_offset = a_offset;
_b_offset = b_offset;
@@ -868,7 +923,7 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo
_output_stage = output_stage;
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
// Check if vector_sum_col_shape should be slidden or not
// Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
@@ -888,16 +943,24 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo
ICpuKernel::configure(win);
}
-Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
- const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ int32_t a_offset,
+ int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
return Status{};
}
-void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -912,14 +975,14 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors
PixelValue type_min{};
PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(dst->info()->data_type());
- int32_t type_min_int = type_min.get<int32_t>();
- int32_t type_max_int = type_max.get<int32_t>();
+ int32_t type_min_int = type_min.get<int32_t>();
+ int32_t type_max_int = type_max.get<int32_t>();
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->info()->num_dimensions() > 1
- && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+ mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
- const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
+ const bool is_bounded_relu =
+ !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
// Check if we need to perform fixed point requantization
const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
@@ -930,22 +993,25 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors
// Check if symmetric per-channel execution
const bool is_symm = _output_stage.is_quantized_per_channel;
- if(is_symm)
+ if (is_symm)
{
- run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst,
+ _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched,
+ _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
else
{
- if(is_signed)
+ if (is_signed)
{
- run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ run_offset_contribution_output_stage<int8_t>(
+ window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset,
+ _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
else
{
- run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
- reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+ run_offset_contribution_output_stage<uint8_t>(
+ window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset,
+ _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
}
}
}
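The helper functions at the top of this file (load_results_input, load, add_s32, mul_s32, get_k_offset) all operate on int32x4x4_t blocks, i.e. four int32x4 registers, so each vectorised iteration of the main loops touches 16 S32 accumulators at a time. A minimal sketch of that pattern, using plain NEON intrinsics and illustrative helper names:

    // Sketch of the 16-lane S32 block pattern (illustrative, needs <arm_neon.h> on AArch64/ARMv7).
    #include <arm_neon.h>
    #include <cstdint>

    static inline int32x4x4_t load16(const int32_t *ptr)
    {
        // Four contiguous 4-lane loads cover 16 accumulators at once.
        return {{vld1q_s32(ptr + 0), vld1q_s32(ptr + 4), vld1q_s32(ptr + 8), vld1q_s32(ptr + 12)}};
    }

    static inline int32x4x4_t add_scalar16(int32x4x4_t acc, int32_t offset)
    {
        // Broadcast a scalar offset (e.g. the b_offset * vector_sum_row term) to all 16 lanes.
        const int32x4_t off = vdupq_n_s32(offset);
        acc.val[0] = vaddq_s32(acc.val[0], off);
        acc.val[1] = vaddq_s32(acc.val[1], off);
        acc.val[2] = vaddq_s32(acc.val[2], off);
        acc.val[3] = vaddq_s32(acc.val[3], off);
        return acc;
    }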
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
index 3cb99faee8..af477d4756 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -85,7 +86,13 @@ public:
* @param[in] b_offset Offset to be added to each element of the matrix B.
* @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
*/
- void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset,
+ void configure(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
int32_t b_offset,
GEMMLowpOutputStageInfo output_stage);
/** Static function to check if given info will lead to a valid configuration
@@ -94,21 +101,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
int32_t b_offset,
GEMMLowpOutputStageInfo output_stage);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
/** Function to use for the particular tensors passed to configure() */
- int32_t _a_offset{ 0 };
- int32_t _b_offset{ 0 };
- int32_t _k_offset{ 0 };
- bool _is_vector_sum_col_batched{ true };
- GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() };
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ int32_t _k_offset{0};
+ bool _is_vector_sum_col_batched{true};
+ GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()};
};
} // namespace kernels
} // namespace cpu
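In the non-fixed-point (QUANTIZE_DOWN) case, both the left-over-element path of the output-stage kernel above and the scale kernel below boil down to an integer multiply, an arithmetic right shift and a clamp. A scalar sketch of that path, with illustrative names; the real kernels do the same on 16 lanes at a time and fold the bias and offset terms into the accumulator beforehand:

    // Scalar QUANTIZE_DOWN sketch (illustrative). OutT is uint8_t for QASYMM8 or int8_t for QASYMM8_SIGNED.
    #include <algorithm>
    #include <cstdint>
    #include <limits>

    template <typename OutT>
    OutT quantize_down_scalar(int32_t acc, int32_t multiplier, int32_t shift,
                              int32_t min_bound, int32_t max_bound, bool is_bounded_relu)
    {
        int32_t v = (acc * multiplier) >> shift; // integer rescale, then arithmetic shift right
        if (is_bounded_relu)
        {
            v = std::max(min_bound, std::min(max_bound, v)); // optional bounded-ReLU clamp
        }
        // Saturate to the representable range of the output type.
        v = std::max<int32_t>(std::numeric_limits<OutT>::lowest(),
                              std::min<int32_t>(std::numeric_limits<OutT>::max(), v));
        return static_cast<OutT>(v);
    }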
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
index 3023d93113..eefc294700 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -28,13 +28,14 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/AccessWindowStatic.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -46,26 +47,35 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+ output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED))
+ if (dst->data_type() != output_stage->output_data_type &&
+ (output_stage->output_data_type == DataType::QASYMM8 ||
+ output_stage->output_data_type == DataType::QASYMM8_SIGNED))
{
ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
}
@@ -92,24 +102,26 @@ inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value,
- typename wrapper::traits::neon_vector<T, 16>::type>::type
- convert_to_8bit(const int16x8x2_t in_s16)
+inline
+ typename std::enable_if<std::is_same<T, uint8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+ convert_to_8bit(const int16x8x2_t in_s16)
{
return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value,
- typename wrapper::traits::neon_vector<T, 16>::type>::type
- convert_to_8bit(const int16x8x2_t in_s16)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+convert_to_8bit(const int16x8x2_t in_s16)
{
return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
}
template <typename T>
-inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min,
- typename wrapper::traits::neon_vector<T, 16>::type max)
+inline typename wrapper::traits::neon_vector<T, 16>::type
+finalize_quantization(int32x4x4_t &in_s32,
+ int32x4_t result_shift_s32,
+ typename wrapper::traits::neon_vector<T, 16>::type min,
+ typename wrapper::traits::neon_vector<T, 16>::type max)
{
// Shift final result (negative value shift right)
in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
@@ -118,13 +130,8 @@ inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(
in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
// Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
+ const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Convert S16 to S8 or U8
typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);
@@ -137,7 +144,10 @@ inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(
} // namespace
template <typename T>
-void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;
@@ -159,107 +169,105 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, c
Iterator in(src, win);
Iterator out(dst, win);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+ finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- // Add the offset terms to GEMM's result and multiply by result_mult_int
- scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
- wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
- int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
-
- // Quantize
- in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
-
- // Store the result
- *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
- }
- },
- in, bias_i, out);
+ const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
+ int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+
+ // Quantize
+ in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) *
+ _output_stage->gemmlowp_multiplier) >>
+ _output_stage->gemmlowp_shift;
+
+ // Store the result
+ *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+ }
+ },
+ in, bias_i, out);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- // Add the offset terms to GEMM's result and multiply by result_mult_int
- scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
- wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ // Add the offset terms to GEMM's result and multiply by result_mult_int
+ scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+ finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
- // Quantize
- in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
+ // Quantize
+ in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >>
+ _output_stage->gemmlowp_shift;
- // Store the result
- *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
- }
- },
- in, out);
+ // Store the result
+ *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validate step
@@ -268,10 +276,7 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso
    // Output auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- bias,
- dst,
- output_stage));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
_output_stage = output_stage;
@@ -281,14 +286,17 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso
ICpuKernel::configure(win);
// Check if we need to clamp the result using min and max
- _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound)
- && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
- if(_output_stage->output_data_type == DataType::QASYMM8)
+ _is_bounded_relu =
+ ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) &&
+ !(_output_stage->gemmlowp_min_bound ==
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) &&
+ _output_stage->gemmlowp_max_bound ==
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
+ if (_output_stage->output_data_type == DataType::QASYMM8)
{
_func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>;
}
- else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
+ else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
{
_func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>;
}
@@ -298,7 +306,10 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso
}
}
-Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
return Status{};
@@ -323,4 +334,4 @@ const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
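The scalar tail of the scale kernel's run_internal above boils down to a single integer requantization: add bias and offset, multiply, arithmetic-shift right, clamp. A minimal standalone sketch of that arithmetic (function and variable names here are illustrative, not library API):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Minimal scalar model of the "quantize down by integer scale" step:
// add bias and offset, multiply, arithmetic shift right, then clamp to
// the output type's bounds (uint8_t here, for a QASYMM8-style output).
static uint8_t requantize_down_scale(int32_t acc, int32_t bias, int32_t offset,
                                     int32_t multiplier, int32_t shift,
                                     int32_t clamp_min, int32_t clamp_max)
{
    int32_t v = ((acc + bias + offset) * multiplier) >> shift;
    v         = std::min(std::max(v, clamp_min), clamp_max);
    return static_cast<uint8_t>(v);
}

int main()
{
    // e.g. a GEMM accumulator of 1234 with bias 10, offset 2, multiplier 3, shift 8
    std::cout << int(requantize_down_scale(1234, 10, 2, 3, 8, 0, 255)) << '\n'; // prints 14
    return 0;
}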
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
index c7813edcd7..33e296b251 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -71,10 +72,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -95,11 +99,14 @@ private:
* @param[out] dst Output tensor info
* @param[in] window Region on which to execute the kernel.
*/
- using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- const GEMMLowpOutputStageInfo *_output_stage{ nullptr };
- bool _is_bounded_relu{ false };
+ QuantizeDownFunctionPtr _func{nullptr};
+ const GEMMLowpOutputStageInfo *_output_stage{nullptr};
+ bool _is_bounded_relu{false};
};
} // namespace kernels
} // namespace cpu
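The header hunk above preserves the kernel's dispatch scheme: a pointer-to-member QuantizeDownFunctionPtr, default-initialized to nullptr and bound in configure() to one of the run_internal<T> specializations. A stripped-down sketch of that pattern, with hypothetical names:

#include <cstdint>
#include <iostream>

// Hypothetical kernel illustrating dispatch through a pointer to a
// templated member function, chosen once at configure() time.
class ToyKernel
{
public:
    void configure(bool use_signed)
    {
        // Bind the member-function pointer to the desired specialization.
        _func = use_signed ? &ToyKernel::run_internal<int8_t> : &ToyKernel::run_internal<uint8_t>;
    }

    void run(int32_t value)
    {
        (this->*_func)(value); // indirect call through the bound specialization
    }

private:
    template <typename T>
    void run_internal(int32_t value)
    {
        std::cout << "stored as " << sizeof(T) << "-byte value: "
                  << static_cast<int>(static_cast<T>(value)) << '\n';
    }

    using RunFuncPtr = void (ToyKernel::*)(int32_t);
    RunFuncPtr _func{nullptr};
};

int main()
{
    ToyKernel k;
    k.configure(/*use_signed=*/true);
    k.run(130); // narrows to int8_t, printing -126
    return 0;
}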
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 53ca991889..a5c09c9977 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NESymm.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NESymm.h"
#include <arm_neon.h>
@@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(min > max);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
} // namespace
template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min));
const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max));
@@ -88,92 +92,92 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(co
Iterator in(src, win_collapsed);
Iterator out(dst, win_collapsed);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x2_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
- }
- };
+ int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
- const int32x4x2_t bias_s32 =
- {
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)
- }
- };
+ const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)}};
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
- }
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+ _result_shift, min_s16, max_s16));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
- static_cast<int16_t>(_max));
- }
- },
- in, out, bias_i);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+ in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+ static_cast<int16_t>(_max));
+ }
+ },
+ in, out, bias_i);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x2_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
- }
- };
+ int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
- vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
- }
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+ _result_shift, min_s16, max_s16));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
- ARM_COMPUTE_UNUSED(in_value);
- // Finalize and store the result
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
- static_cast<int16_t>(_max));
- }
- },
- in, out);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+ ARM_COMPUTE_UNUSED(in_value);
+ // Finalize and store the result
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+ in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+ static_cast<int16_t>(_max));
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
- int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int min,
+ int max)
{
// Perform validate step
ARM_COMPUTE_UNUSED(bias, dst);
@@ -193,18 +197,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITens
// Check if we need to clamp the result using min and max
const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
- _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> :
- &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
}
-Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
return Status{};
}
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
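The reflowed calls to finalize_quantization_int16 implement a gemmlowp-style fixed-point requantization: a rounding doubling high multiply by result_fixedpoint_multiplier, a rounding shift by result_shift, then saturation to int16. The scalar sketch below assumes the conventional rounding rules and is not a bit-exact rendering of the NESymm helpers:

#include <algorithm>
#include <cstdint>
#include <iostream>

// gemmlowp-style rounding doubling high multiply, without the
// INT32_MIN * INT32_MIN saturation corner case.
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const int64_t prod  = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = prod >= 0 ? (1LL << 30) : (1 - (1LL << 30));
    return static_cast<int32_t>((prod + nudge) / (1LL << 31));
}

static int16_t requantize_to_int16(int32_t acc, int32_t multiplier, int shift,
                                   bool bounded_relu, int16_t lo, int16_t hi)
{
    int32_t v = rounding_doubling_high_mul(acc, multiplier);
    if (shift > 0) // rounding shift right by the result shift
        v = (v + (1 << (shift - 1))) >> shift;
    v = std::clamp(v, -32768, 32767); // saturate to the int16 range
    if (bounded_relu)
        v = std::clamp(v, static_cast<int32_t>(lo), static_cast<int32_t>(hi));
    return static_cast<int16_t>(v);
}

int main()
{
    // Accumulator of 1'000'000, multiplier ~0.5 in Q31, result shift of 4.
    std::cout << requantize_to_int16(1000000, 1 << 30, 4, false, 0, 0) << '\n'; // prints 31250
    return 0;
}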
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 681d099695..925788b680 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -48,7 +49,8 @@ namespace kernels
* -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
@@ -65,17 +67,24 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
*/
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int min = 0,
+ int max = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -97,13 +106,13 @@ private:
* @param[in] window Region on which to execute the kernel.
*/
using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(
- const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _min{ 0 };
- int _max{ 0 };
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _min{0};
+ int _max{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index 27214dcb5a..0e58097073 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
#include <arm_neon.h>
@@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(min > max);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
} // namespace
template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min));
@@ -88,102 +92,102 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(con
Iterator in(src, win_collapsed);
Iterator out(dst, win_collapsed);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
- }
- },
- in, out, bias_i);
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+ in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out, bias_i);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
- finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Finalize and store the result
- *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
- static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
- }
- },
- in, out);
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Finalize and store the result
+ *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+ in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+ static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min,
+ int max)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validate step
@@ -205,18 +209,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITenso
// Check if we need to clamp the result using min and max
const bool is_bounded_relu = !(min <= -128 && max >= 127);
- _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> :
- &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
}
-Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
return Status{};
}
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
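configure() above decides at setup time whether clamping is needed: is_bounded_relu is false exactly when [min, max] already spans the whole representable range of the output type, in which case the run_internal<false> specialization skips the extra clamp. A small illustration of that check (the helper name is hypothetical):

#include <cstdint>
#include <iostream>
#include <limits>

// Clamping is only worth doing when [min, max] is tighter than the full
// range of the output type; otherwise the <false> specialization is picked.
template <typename T>
constexpr bool needs_bounded_relu(int min, int max)
{
    return !(min <= std::numeric_limits<T>::min() && max >= std::numeric_limits<T>::max());
}

int main()
{
    std::cout << std::boolalpha;
    std::cout << needs_bounded_relu<int8_t>(-128, 127) << '\n'; // false: full range, no clamp
    std::cout << needs_bounded_relu<int8_t>(0, 127) << '\n';    // true: acts as a ReLU
    std::cout << needs_bounded_relu<uint8_t>(0, 255) << '\n';   // false
    return 0;
}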
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 3e615b935e..6a67ba4f19 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -49,7 +50,8 @@ namespace kernels
* -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
@@ -67,17 +69,25 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min = 0,
+ int max = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -99,14 +109,14 @@ private:
* @param[in] window Region on which to execute the kernel.
*/
using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(
- const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
- int _min{ 0 };
- int _max{ 0 };
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+ int _min{0};
+ int _max{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index e49fd29115..e3dd2240ca 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
#include <arm_neon.h>
@@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
ARM_COMPUTE_RETURN_ERROR_ON(min > max);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const
} // namespace
template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min));
@@ -89,98 +93,102 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(co
Iterator in(src, win_collapsed);
Iterator out(dst, win_collapsed);
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window win_biases;
win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
Iterator bias_i(bias, win_biases);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- const int32x4x4_t bias_s32 =
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ const int32x4x4_t bias_s32 = {
+ {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+ // Add the bias to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+ vst1q_u8(out.ptr() + x,
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
- }
- };
-
- // Add the bias to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
- vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
- int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Add bias
- in_value += bias_value;
- // Finalize and store the result
- *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
- }
- },
- in, out, bias_i);
+ const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Add bias
+ in_value += bias_value;
+ // Finalize and store the result
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+ _result_offset_after_shift, static_cast<uint8_t>(_min),
+ static_cast<uint8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out, bias_i);
}
else
{
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- // Compute 16 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- int32x4x4_t in_s32 =
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
- }
- };
-
- vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Finalize and store the result
- *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
- }
- },
- in, out);
+ int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+ vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
+
+ vst1q_u8(out.ptr() + x,
+ finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+ result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Finalize and store the result
+ *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+ _result_offset_after_shift, static_cast<uint8_t>(_min),
+ static_cast<uint8_t>(_max), is_bounded_relu);
+ }
+ },
+ in, out);
}
}
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min,
+ int max)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validate step
@@ -202,18 +210,21 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITens
// Check if we need to clamp the result using min and max
const bool is_bounded_relu = !(min <= 0 && max >= 255);
- _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> :
- &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
+ _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true>
+ : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
}
-Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
return Status{};
}
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -233,4 +244,4 @@ const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() c
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
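All of the run_internal bodies above share the same traversal: a main loop consuming window_step_x (16) elements per iteration through NEON loads and stores, followed by a scalar loop for the leftover elements. A portable sketch of that loop structure, without intrinsics and with illustrative names:

#include <cstdint>
#include <iostream>
#include <vector>

// Structure of the "16 elements per iteration + scalar tail" loops used by
// the quantize-down kernels: the blocked body stands in for the NEON
// vld1q/vst1q path, the tail loop for the per-element fallback.
static void copy_saturate_u8(const int32_t *src, uint8_t *dst, int len)
{
    constexpr int step = 16;
    int           x    = 0;
    for (; x <= len - step; x += step) // main, vectorizable block
    {
        for (int i = 0; i < step; ++i)
        {
            const int32_t v = src[x + i];
            dst[x + i]      = static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
    }
    for (; x < len; ++x) // compute left-over elements
    {
        const int32_t v = src[x];
        dst[x]          = static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}

int main()
{
    std::vector<int32_t> in(37);
    for (int i = 0; i < 37; ++i)
        in[i] = i * 10 - 40; // mixes negative, in-range and >255 values
    std::vector<uint8_t> out(in.size());
    copy_saturate_u8(in.data(), out.data(), static_cast<int>(in.size()));
    std::cout << int(out[0]) << ' ' << int(out[10]) << ' ' << int(out[36]) << '\n'; // 0 60 255
    return 0;
}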
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index b773fdfdcf..45bd742a70 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -49,7 +50,8 @@ namespace kernels
* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
*
*/
-class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
{
public:
CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
@@ -67,17 +69,25 @@ public:
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
- void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ void configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ int result_fixedpoint_multiplier,
+ int result_shift,
+ int result_offset_after_shift,
+ int min = 0,
+ int max = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -93,14 +103,14 @@ private:
* @param[in] window Region on which to execute the kernel.
*/
using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(
- const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+ const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
- QuantizeDownFunctionPtr _func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
- int _min{ 0 };
- int _max{ 0 };
+ QuantizeDownFunctionPtr _func{nullptr};
+ int _result_fixedpoint_multiplier{0};
+ int _result_shift{0};
+ int _result_offset_after_shift{0};
+ int _min{0};
+ int _max{0};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
index 6399ebbef4..fb1b70b91f 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
#include "src/cpu/kernels/gemm_matrix_add/list.h"
namespace arm_compute
{
@@ -40,24 +41,12 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels =
-{
- {
- "neon_fp32_gemm_matrix_add",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32);
- },
- REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)
- },
- {
- "neon_fp16_gemm_matrix_add",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.fp16;
- },
- REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)
- },
+static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = {
+ {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)},
+ {"neon_fp16_gemm_matrix_add",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)},
};
} // namespace
@@ -71,7 +60,8 @@ void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta));
_beta = beta;
- const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_func = uk->ukernel;
// Configure kernel window
@@ -87,7 +77,7 @@ Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -105,7 +95,7 @@ void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &win
const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_beta != 0.0f)
+ if (_beta != 0.0f)
{
(*_func)(src, dst, window, _beta);
}
@@ -116,7 +106,8 @@ const char *CpuGemmMatrixAdditionKernel::name() const
return "CpuGemmMatrixAdditionKernel";
}
-const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &CpuGemmMatrixAdditionKernel::get_available_kernels()
+const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &
+CpuGemmMatrixAdditionKernel::get_available_kernels()
{
return available_kernels;
}
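The reformatted available_kernels table keeps the micro-kernel registry pattern: each entry pairs a name, a selector predicate over data type and ISA, and a registered function pointer, and get_implementation() returns the first entry whose predicate matches. A generic sketch of that pattern with made-up names:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// The first entry whose predicate matches the runtime description wins.
// Names here (SelectorData, pick_kernel) are illustrative only.
struct SelectorData
{
    bool is_fp32;
    bool has_fp16;
};

struct KernelEntry
{
    std::string                               name;
    std::function<bool(const SelectorData &)> matches;
    void (*ukernel)(float);
};

static void fp32_kernel(float v) { std::cout << "fp32 kernel: " << v << '\n'; }
static void fp16_kernel(float v) { std::cout << "fp16 kernel: " << v << '\n'; }

static const std::vector<KernelEntry> available_kernels = {
    {"neon_fp32_example", [](const SelectorData &d) { return d.is_fp32; }, fp32_kernel},
    {"neon_fp16_example", [](const SelectorData &d) { return !d.is_fp32 && d.has_fp16; }, fp16_kernel},
};

static const KernelEntry *pick_kernel(const SelectorData &d)
{
    for (const auto &k : available_kernels)
        if (k.matches(d))
            return &k;
    return nullptr;
}

int main()
{
    const KernelEntry *k = pick_kernel({/*is_fp32=*/true, /*has_fp16=*/false});
    if (k != nullptr)
        k->ukernel(1.5f); // prints "fp32 kernel: 1.5"
    return 0;
}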
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
index cbc5b53087..5e12f1dcbd 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
@@ -75,7 +75,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
static const std::vector<GemmMatrixAddKernel> &get_available_kernels();
@@ -89,8 +89,8 @@ private:
* @param[in] beta Weight of matrix C
*/
/** Matrix addition function to use for the particular tensor types passed to configure() */
- GemmMatrixAddKernelPtr _func{ nullptr };
- float _beta{ 0.f };
+ GemmMatrixAddKernelPtr _func{nullptr};
+ float _beta{0.f};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
index 03b372efd4..beccd94844 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
@@ -26,10 +26,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/gemm_matrix_mul/list.h"
@@ -42,27 +43,20 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels =
-{
- {
- "neon_fp32_gemm_matrix_mul",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F32);
- },
- REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)
- },
- {
- "neon_fp16_gemm_matrix_mul",
- [](const DataTypeISASelectorData & data)
- {
- return (data.dt == DataType::F16) && data.isa.fp16;
- },
- REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)
- },
+static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = {
+ {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)},
+ {"neon_fp16_gemm_matrix_mul",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)},
};
-inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+inline Status validate_arguments(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_UNUSED(alpha);
@@ -70,11 +64,11 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs,
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
- if(!is_interleaved)
+ if (!is_interleaved)
{
ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1));
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1));
@@ -90,28 +84,31 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs,
const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
/* Interleave */
- TensorShape tensor_shape0{ lhs->tensor_shape() };
+ TensorShape tensor_shape0{lhs->tensor_shape()};
tensor_shape0.set(0, k);
tensor_shape0.set(1, m);
const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+ const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0);
- if(n != 0) /* Transpose */
+ if (n != 0) /* Transpose */
{
- TensorShape tensor_shape1{ rhs->tensor_shape() };
+ TensorShape tensor_shape1{rhs->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+ const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(
+ tensor_info1, mult_transpose1xW_width));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1);
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- if(n != 0)
+ if (n != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n));
}
@@ -125,12 +122,17 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs,
} // namespace
-void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
// dst tensor auto inizialitation if not yet initialized
- TensorShape tensor_shape{ lhs->tensor_shape() };
+ TensorShape tensor_shape{lhs->tensor_shape()};
tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0));
tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1));
@@ -146,7 +148,7 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso
// Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication
const bool is_dst_vector = (dst->dimension(1) == 1);
- if(is_dst_vector)
+ if (is_dst_vector)
{
const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32;
@@ -157,17 +159,23 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso
constexpr unsigned int num_elems_processed_per_iteration_x = 8;
constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
}
- const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ lhs->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(
+ DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_func = uk->ukernel;
ICPPKernel::configure(win);
}
-Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved,
+Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
@@ -195,7 +203,8 @@ const char *CpuGemmMatrixMultiplyKernel::name() const
return "CpuGemmMatrixMultiplyKernel";
}
-const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &CpuGemmMatrixMultiplyKernel::get_available_kernels()
+const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &
+CpuGemmMatrixMultiplyKernel::get_available_kernels()
{
return available_kernels;
}
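
Aside (not part of the patch): the available_kernels table reflowed above is an instance of the library's table-driven ukernel dispatch, where each entry pairs a name with a selector predicate over the requested data type and CPU ISA and with a kernel function pointer, and configure() looks up a matching entry through get_implementation(DataTypeISASelectorData{...}). The standalone sketch below mirrors that pattern with stand-in types; sketch::SelectorData, sketch::IsaInfo and the placeholder kernel signature are illustrative substitutes, not the library's real DataTypeISASelectorData or ukernel types.

#include <functional>
#include <string>
#include <vector>

namespace sketch
{
enum class DataType { F16, F32 };
struct IsaInfo      { bool fp16 = false; };          // stand-in for cpuinfo::CpuIsaInfo
struct SelectorData { DataType dt; IsaInfo isa; };   // stand-in for DataTypeISASelectorData

struct MicroKernel
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)(float);                           // placeholder signature
};

// Returns the first entry whose predicate accepts the requested {data type, ISA}.
inline const MicroKernel *get_implementation(const std::vector<MicroKernel> &table, const SelectorData &data)
{
    for (const auto &uk : table)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr; // the real code guards this with ARM_COMPUTE_ERROR_ON_NULLPTR(uk)
}

inline void dummy_kernel(float) {}

inline const MicroKernel *select_fp32_example()
{
    static const std::vector<MicroKernel> table = {
        {"fp32_example", [](const SelectorData &d) { return d.dt == DataType::F32; }, dummy_kernel},
        {"fp16_example", [](const SelectorData &d) { return d.dt == DataType::F16 && d.isa.fp16; }, dummy_kernel},
    };
    return get_implementation(table, SelectorData{DataType::F32, IsaInfo{}});
}
} // namespace sketch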
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
index a7dfec87bd..765fcb8275 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
@@ -42,7 +42,8 @@ namespace kernels
class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel>
{
private:
- using GemmMatrixMulKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
+ using GemmMatrixMulKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
public:
struct GemmMatrixMulKernel
@@ -67,17 +68,27 @@ public:
* @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel
* @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped
*/
- void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
+ void configure(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel
*
* Similar to @ref CpuGemmMatrixMultiplyKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ float alpha,
+ bool is_interleaved,
+ const GEMMReshapeInfo &reshape_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
static const std::vector<GemmMatrixMulKernel> &get_available_kernels();
@@ -94,8 +105,8 @@ private:
*/
/** Matrix multiply function to use for the particular tensor types passed to configure() */
- GemmMatrixMulKernelPtr _func{ nullptr };
- float _alpha{ 1.f };
+ GemmMatrixMulKernelPtr _func{nullptr};
+ float _alpha{1.f};
};
} // namespace kernels
} // namespace cpu
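
For reference, the newly wrapped configure()/validate() signatures are used roughly as in the sketch below. It is illustrative only and not part of the patch: the shapes and alpha value are made up, and it assumes the non-interleaved path whose shape rules appear in validate_arguments() in the .cpp above (lhs = (K, M), rhs = (N, K), dst = (N, M)).

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"

using namespace arm_compute;

void example_gemm_matrix_multiply()
{
    // Non-interleaved path, shapes chosen to satisfy the checks in validate_arguments():
    // lhs = (K, M), rhs = (N, K), dst = (N, M) with K = 4, M = 3, N = 5.
    TensorInfo lhs(TensorShape(4U, 3U), 1, DataType::F32);
    TensorInfo rhs(TensorShape(5U, 4U), 1, DataType::F32);
    TensorInfo dst(TensorShape(5U, 3U), 1, DataType::F32);

    // Static validation first; configure() asserts on the same conditions.
    const Status st = cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
        &lhs, &rhs, &dst, /*alpha=*/1.f, /*is_interleaved=*/false, GEMMReshapeInfo());
    (void)st;

    cpu::kernels::CpuGemmMatrixMultiplyKernel mm_kernel;
    mm_kernel.configure(&lhs, &rhs, &dst, /*alpha=*/1.f, /*is_interleaved=*/false);
    // Execution goes through run_op(ITensorPack &, const Window &, const ThreadInfo &), not shown here.
}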
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
index 62d5d5f5e9..c47746bc4b 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -63,9 +64,10 @@ Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ compute_transpose1xW_with_element_size_shape(*src));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -107,25 +109,28 @@ void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &windo
const size_t out_stride = dst->info()->strides_in_bytes()[1];
const size_t vector_size = 16 / element_size;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const uint8_t *in_ptr = in.ptr();
- uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
-
- for(size_t k = 0; k < vector_size; ++k)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // If the src width is not multiple of W, we fill the reference with 0s
- if((id.x() + k) >= in_width)
- {
- std::memset(out_ptr + k * element_size, 0, element_size);
- }
- else
+ const uint8_t *in_ptr = in.ptr();
+ uint8_t *const out_ptr =
+ out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
+
+ for (size_t k = 0; k < vector_size; ++k)
{
- std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+ // If the src width is not multiple of W, we fill the reference with 0s
+ if ((id.x() + k) >= in_width)
+ {
+ std::memset(out_ptr + k * element_size, 0, element_size);
+ }
+ else
+ {
+ std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+ }
}
- }
- },
- in, out);
+ },
+ in, out);
}
const char *CpuGemmTranspose1xWKernel::name() const
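
One small piece of context for the run_op() hunk above: vector_size = 16 / element_size is the "W" in Transpose1xW, i.e. the number of elements that fit in one 16-byte block, and elements past in_width are zero-filled so every block is complete. A trivial standalone sketch (not part of the patch):

#include <cstddef>

// Same derivation as `const size_t vector_size = 16 / element_size;` in run_op().
constexpr std::size_t transpose_block_elements(std::size_t element_size)
{
    return 16 / element_size;
}

static_assert(transpose_block_elements(1) == 16, "U8  -> 1x16 blocks");
static_assert(transpose_block_elements(2) == 8,  "F16 -> 1x8 blocks");
static_assert(transpose_block_elements(4) == 4,  "F32/S32 -> 1x4 blocks");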
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
index 0ca92641b7..4b834b2cc6 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
@@ -88,7 +88,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
index 9ac291549b..55ac7c5192 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -29,13 +29,13 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
@@ -51,26 +51,34 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon");
// Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+ const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
- TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
+ TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(
+ input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -106,14 +114,14 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
// This for loop linearize a volume with 3 slices. This allows:
// 1) to reduce the iterations of the outer for loop "d"
// 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
- for(; d <= (kernel_depth - 3); d += 3)
+ for (; d <= (kernel_depth - 3); d += 3)
{
- for(int y = top_left_y; y < y_e; y += dilation_y)
+ for (int y = top_left_y; y < y_e; y += dilation_y)
{
- if((y < 0 || y >= input_h) && has_pads)
+ if ((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -122,9 +130,9 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
- if((x < 0 || x >= input_w) && has_pads)
+ if ((x < 0 || x >= input_w) && has_pads)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -132,9 +140,12 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
}
}
}
@@ -143,11 +154,11 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
// Left over
- for(; d < kernel_depth; d++)
+ for (; d < kernel_depth; d++)
{
- for(int y = top_left_y; y < y_e; y += dilation_y)
+ for (int y = top_left_y; y < y_e; y += dilation_y)
{
- if((y < 0 || y >= input_h) && has_pads)
+ if ((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
@@ -155,15 +166,16 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
- if((x < 0 || x >= input_w) && has_pads)
+ if ((x < 0 || x >= input_w) && has_pads)
{
*out_ptr = pad_value;
}
else
{
- *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *out_ptr = *(reinterpret_cast<const T *>(
+ in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
}
}
}
@@ -171,7 +183,7 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -198,36 +210,39 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
const int end_y = start_y + kernel_height * dilation_y;
const int pad_quant = kernel_width * input_c;
const int element_size = static_cast<int>(sizeof(T));
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == input_c * element_size))
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
//optimized for no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
out_ptr += input_c * kernel_width;
}
}
else
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
- if(y < 0 || y >= input_h)
+ if (y < 0 || y >= input_h)
{
memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
out_ptr += pad_quant;
}
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ for (int x = start_x; x < end_x; x += dilation_x)
{
- if(x < 0 || x >= input_w)
+ if (x < 0 || x >= input_w)
{
memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
out_ptr += input_c;
}
else
{
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ input_c * element_size);
out_ptr += input_c;
}
}
@@ -235,13 +250,14 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
else
{
//optimized for no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
out_ptr += input_c * kernel_width;
}
}
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -271,12 +287,13 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
const int element_size = static_cast<int>(sizeof(T));
const int channel_chunk_size = input_c * element_size;
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size))
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == channel_chunk_size))
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
- for(int e = 0; e < kernel_width; e++)
+ for (int e = 0; e < kernel_width; e++)
{
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
out_ptr += input_c + pad_right;
@@ -285,25 +302,26 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
}
else
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
- if(y < 0 || y >= input_h)
+ if (y < 0 || y >= input_h)
{
memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
out_ptr += pad_quant;
}
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ for (int x = start_x; x < end_x; x += dilation_x)
{
- if(x < 0 || x >= input_w)
+ if (x < 0 || x >= input_w)
{
memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size);
out_ptr += input_c + pad_right;
}
else
{
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ channel_chunk_size);
out_ptr += input_c + pad_right;
}
}
@@ -311,16 +329,17 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
else
{
const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
- for(int e = 0; e < kernel_width; e++)
+ for (int e = 0; e < kernel_width; e++)
{
- memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
+ channel_chunk_size);
out_ptr += input_c + pad_right;
}
}
}
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -348,7 +367,8 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
const int pad_top = _conv_info.pad_top();
const int stride_x = _conv_info.stride().first;
const int stride_y = _conv_info.stride().second;
- const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
+ const int pad_value =
+ is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
Window window_in_out(window);
// The first three dimensions of the input and output are increased by the inner loops
@@ -361,84 +381,57 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
Iterator out(dst, window_in_out);
execute_window_loop(
- window, [&](const Coordinates & id)
- {
- const int start_w = id[width_idx] * stride_x - pad_left;
- const int start_h = id[height_idx] * stride_y - pad_top;
+ window,
+ [&](const Coordinates &id)
+ {
+ const int start_w = id[width_idx] * stride_x - pad_left;
+ const int start_h = id[height_idx] * stride_y - pad_top;
- // Get pointers
- const uint8_t *const input_ptr = in.ptr();
- auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y());
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ auto output_ptr =
+ reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) *
+ dst->info()->strides_in_bytes().y());
- // Linearize volume
- if(is_nchw)
- {
- linearize_volume_nchw<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_c,
- input_w,
- input_h,
- input_stride_x,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
- }
- else
- {
- if(_input_pad_right > 0)
+ // Linearize volume
+ if (is_nchw)
{
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y(),
- _input_pad_right);
+ linearize_volume_nchw<T, has_pads>(
+ input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_c, input_w,
+ input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
}
else
{
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
+ if (_input_pad_right > 0)
+ {
+ linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, _has_bias, start_w, start_h,
+ _kernel_width, _kernel_height, input_w, input_h, input_c,
+ input_stride_y, input_stride_z, pad_value, _dilation.x(),
+ _dilation.y(), _input_pad_right);
+ }
+ else
+ {
+ linearize_volume_nhwc<T, has_pads>(
+ input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_w,
+ input_h, input_c, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
+ }
}
- }
- },
- in, out);
+ },
+ in, out);
}
-void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+void CpuIm2ColKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
ARM_COMPUTE_UNUSED(num_groups);
_data_layout = src->data_layout();
@@ -451,31 +444,34 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
_kernel_height = kernel_dims.height;
_input_pad_right = input_pad_right;
_dilation = dilation;
- _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx),
- _kernel_width, _kernel_height,
- _conv_info, _dilation);
+ _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width,
+ _kernel_height, _conv_info, _dilation);
_has_bias = has_bias;
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true>
+ : &CpuIm2ColKernel::run_im2col<float, true, true>;
break;
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true>
+ : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true>
+ : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8_SIGNED:
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -484,26 +480,31 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
}
else
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false>
+ : &CpuIm2ColKernel::run_im2col<float, true, false>;
break;
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false>
+ : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
case DataType::QASYMM8_SIGNED:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -512,11 +513,13 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
}
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right)));
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation,
+ false, num_groups, _input_pad_right)));
- std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
- kernel_dims.width, kernel_dims.height,
- conv_info, dilation);
+ std::pair<unsigned int, unsigned int> convolved_dims =
+ scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height,
+ conv_info, dilation);
Window win = calculate_max_window(*src, Steps());
win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
@@ -526,10 +529,17 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
ICpuKernel::configure(win);
}
-Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status CpuIm2ColKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
return Status{};
}
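
The linearize_volume_* helpers reformatted above implement the usual im2col lowering: for every output coordinate the receptive field (kernel_w x kernel_h x channels, plus a trailing 1 when there is a bias) is copied into one row, and positions falling outside the image are filled with the pad value (0 for float, the quantization offset for quantized types). The scalar reference below is an illustrative sketch, not part of the patch; it assumes NHWC order, no dilation, no extra channel padding and a float pad value of 0.

#include <cstddef>
#include <vector>

std::vector<float> im2col_nhwc_reference(const std::vector<float> &src, // layout: [h][w][c]
                                         int w, int h, int c,
                                         int kernel_w, int kernel_h,
                                         int stride, int pad, bool has_bias)
{
    const int out_w = (w + 2 * pad - kernel_w) / stride + 1;
    const int out_h = (h + 2 * pad - kernel_h) / stride + 1;
    const std::size_t col_len = static_cast<std::size_t>(kernel_w) * kernel_h * c + (has_bias ? 1 : 0);

    std::vector<float> dst(static_cast<std::size_t>(out_w) * out_h * col_len, 0.f);
    std::size_t out_idx = 0;
    for (int oy = 0; oy < out_h; ++oy)
    {
        for (int ox = 0; ox < out_w; ++ox)
        {
            const int start_x = ox * stride - pad; // matches start_w in run_im2col()
            const int start_y = oy * stride - pad; // matches start_h in run_im2col()
            for (int ky = 0; ky < kernel_h; ++ky)
            {
                for (int kx = 0; kx < kernel_w; ++kx)
                {
                    for (int ch = 0; ch < c; ++ch)
                    {
                        const int x = start_x + kx;
                        const int y = start_y + ky;
                        const bool inside = (x >= 0 && x < w && y >= 0 && y < h);
                        dst[out_idx++] = inside ? src[(static_cast<std::size_t>(y) * w + x) * c + ch] : 0.f;
                    }
                }
            }
            if (has_bias)
            {
                dst[out_idx++] = 1.f; // "Append 1 if the convolution layer has biases"
            }
        }
    }
    return dst;
}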
diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h
index d133f8dc2d..2cb26179ce 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/cpu/kernels/CpuIm2ColKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H
#include "arm_compute/core/Size2D.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -78,16 +79,28 @@ public:
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
* @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0);
+ void configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuIm2ColKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -117,15 +130,15 @@ private:
*/
using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
- Im2ColFunctionPtr _func{ nullptr };
+ Im2ColFunctionPtr _func{nullptr};
std::pair<unsigned int, unsigned int> _convolved_dims{};
PadStrideInfo _conv_info{};
- unsigned int _kernel_width{ 0 };
- unsigned int _kernel_height{ 0 };
- unsigned int _input_pad_right{ 0 };
- bool _has_bias{ false };
- Size2D _dilation{ 1U, 1U };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
+ unsigned int _kernel_width{0};
+ unsigned int _kernel_height{0};
+ unsigned int _input_pad_right{0};
+ bool _has_bias{false};
+ Size2D _dilation{1U, 1U};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
};
} // namespace kernels
} // namespace cpu
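
A minimal call sketch for the rewrapped CpuIm2ColKernel::validate() signature (not part of the patch): the shapes, stride and padding are illustrative, dst is left empty so that configure() can auto-initialize it, and the optional dilation/num_groups/input_pad_right arguments keep their defaults.

#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/kernels/CpuIm2ColKernel.h"

using namespace arm_compute;

void example_im2col_validate()
{
    // NCHW F32 input: width = 8, height = 8, channels = 3, batch = 1 (illustrative values).
    const TensorInfo src(TensorShape(8U, 8U, 3U, 1U), 1, DataType::F32);
    TensorInfo       dst; // left empty: configure() auto-initializes it via compute_im2col_conv_shape()

    const Status st = cpu::kernels::CpuIm2ColKernel::validate(&src, &dst, Size2D(3U, 3U),
                                                              PadStrideInfo(1, 1, 1, 1), /*has_bias=*/true);
    (void)st; // dilation, num_groups and input_pad_right keep their defaults (1x1, 1, 0)
}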
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 39adc9af7c..b7daa4d583 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
#include "arm_compute/core/Types.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
namespace arm_compute
@@ -78,10 +79,10 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData
struct ActivationDataTypeISASelectorData
{
- DataType dt;
- const CPUModel &cpumodel;
- const cpuinfo::CpuIsaInfo &isa;
- const ActivationFunction f;
+ DataType dt;
+ const CPUModel &cpumodel;
+ const cpuinfo::CpuIsaInfo &isa;
+ const ActivationFunction f;
};
struct CpuAddKernelDataTypeISASelectorData
@@ -99,15 +100,19 @@ struct ScaleKernelDataTypeISASelectorData
};
// Selector pointer types
-using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
-using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
-using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
-using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
-using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
-using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
-using ActivationDataTypeISASelectorDataPtr = std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
-using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
-using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
+using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
+using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
+using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
+using DepthwiseConv2dNativeDataTypeISASelectorPtr =
+ std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
+using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
+using ActivationDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
+using CpuAddKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
+using ScaleKernelDataTypeISASelectorDataPtr =
+ std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
} // namespace kernels
} // namespace cpu
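
The realigned aliases above are plain function-pointer typedefs: std::add_pointer<bool(const X &)>::type is just bool (*)(const X &), which is why the capture-less selector lambdas in the kernel tables can be stored through them. A tiny standalone sketch (not part of the patch; Data is a made-up stand-in):

#include <type_traits>

struct Data { int dt; };

using SelectorPtr = std::add_pointer<bool(const Data &)>::type; // same as bool (*)(const Data &)
static_assert(std::is_same<SelectorPtr, bool (*)(const Data &)>::value, "plain function pointer");

// A capture-less lambda converts implicitly to the pointer type:
const SelectorPtr is_zero = [](const Data &d) { return d.dt == 0; };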
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
index 7d077c75bf..bcaa76b99b 100644
--- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
@@ -24,11 +24,12 @@
#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/maxunpool/list.h"
@@ -43,50 +44,43 @@ using namespace misc::shape_calculator;
namespace
{
-static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels =
-{
- {
- "neon_fp32_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(neon_fp32_maxunpooling)
- },
- {
- "neon_fp16_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_maxunpooling)
- },
- {
- "neon_qu8_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)
- },
- {
- "neon_qs8_maxunpooling",
- [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)
- },
+static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = {
+ {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(neon_fp32_maxunpooling)},
+ {"neon_fp16_maxunpooling",
+ [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_maxunpooling)},
+ {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)},
+ {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices);
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ PoolingType pool_type = pool_info.pool_type;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_size_x = pool_info.pool_size.width;
- const int pool_size_y = pool_info.pool_size.height;
+ const int pool_size_x = pool_info.pool_size.width;
+ const int pool_size_y = pool_info.pool_size.height;
const Size2D pool_size(pool_size_x, pool_size_y);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -96,13 +90,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, co
}
} // namespace
-void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info));
ARM_COMPUTE_UNUSED(indices);
- const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
@@ -113,7 +111,10 @@ void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensor
ICpuKernel::configure(window);
}
-Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info));
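
For reference, the constraints enforced by validate_arguments() above (U32 indices with the same shape as src, MAX pooling only, 2x2 pool size) translate into a call like the sketch below. It is illustrative only and not part of the patch: the shapes are made up (a 4x4x3 pooled tensor being unpooled back to 8x8x3 for a 2x2, stride-2 pooling), and it does not reproduce any dst-shape requirements beyond the checks visible in this hunk.

#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"

using namespace arm_compute;

void example_max_unpooling_validate()
{
    // src/indices are the output of a previous 2x2, stride-2 MAX pooling of an 8x8x3 tensor,
    // so dst restores the pre-pooling spatial size (illustrative shapes, NCHW assumed).
    const TensorInfo src(TensorShape(4U, 4U, 3U), 1, DataType::F32);
    const TensorInfo indices(TensorShape(4U, 4U, 3U), 1, DataType::U32); // U32, same shape as src
    const TensorInfo dst(TensorShape(8U, 8U, 3U), 1, DataType::F32);

    PoolingLayerInfo pool_info;
    pool_info.pool_type       = PoolingType::MAX;        // anything else is rejected above
    pool_info.pool_size       = Size2D(2, 2);            // only 2x2 is supported above
    pool_info.pad_stride_info = PadStrideInfo(2, 2, 0, 0);

    const Status st = cpu::kernels::CpuMaxUnpoolingLayerKernel::validate(&src, &indices, &dst, pool_info);
    (void)st;
}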
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
index d0c13471c8..5a641a2bea 100644
--- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuMaxUnpoolingLayerKernel : public ICpuKernel<CpuMaxUnpoolingLayerKernel>
{
private:
- using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
+ using MaxUnpoolingUKernelPtr = std::add_pointer<void(
+ const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
public:
/** Default constructor */
@@ -56,7 +57,8 @@ public:
* @param[out] dst Destination tensor. Data types supported: Same as @p src
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel
*
* @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -66,7 +68,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -83,7 +88,7 @@ public:
const char *name() const override;
private:
- MaxUnpoolingUKernelPtr _run_method{ nullptr };
+ MaxUnpoolingUKernelPtr _run_method{nullptr};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index b73d2bdf73..ba086e3ac6 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -25,23 +25,24 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
namespace
{
#if defined(ENABLE_FP32_KERNELS)
- static constexpr size_t default_mws_N1_fp32_neon = 22447;
- static constexpr size_t default_mws_V1_fp32_neon = 38982;
+static constexpr size_t default_mws_N1_fp32_neon = 22447;
+static constexpr size_t default_mws_V1_fp32_neon = 38982;
#endif /* ENABLE_FP32_KERNELS */
- static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
-}
+static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
+} // namespace
namespace arm_compute
{
namespace cpu
@@ -54,29 +55,38 @@ const float scale255_constant = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
-inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+inline Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
DataType::S32, DataType::F16, DataType::F32);
- if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
+ if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP,
+ "ConvertPolicy cannot be WRAP if datatype is quantized");
}
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// clang-format off
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
@@ -88,13 +98,17 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src
!(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
, "Invalid data type combination");
// clang-format on
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 &&
+ scale != 1.f,
+ "Unsupported scale for QSYMM16 inputs and S32 dst");
}
- if(std::abs(scale - scale255_constant) < 0.00001f)
+ if (std::abs(scale - scale255_constant) < 0.00001f)
{
- ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP &&
+ rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 &&
+ dst->data_type() == DataType::S32,
"Scale == 1/255 is not supported if input and dst are of data type S32");
}
else
@@ -107,7 +121,8 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src
// Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
// frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
// Moreover, it will be negative as we deal with 1/2^n
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
+ "Scale value not supported (Should be 1/(2^n) or 1/255");
}
return Status{};
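
The frexp() test reformatted above is easier to see with concrete numbers: for scale = 2^-n with 0 <= n <= 15, frexp() returns a mantissa of exactly 0.5 and an exponent of 1 - n, so the accepted exponent range is [-14, 1]. A small standalone check (not part of the patch) that mirrors the condition:

#include <cassert>
#include <cmath>

bool is_pow2_reciprocal_scale(float scale)
{
    int exponent = 0;
    const float normalized_mantissa = std::frexp(scale, &exponent);
    // Same condition as the (negated) ARM_COMPUTE_RETURN_ERROR_ON_MSG check above.
    return (normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1);
}

void example_scale_check()
{
    assert(is_pow2_reciprocal_scale(1.f));           // 1/2^0  -> mantissa 0.5, exponent 1
    assert(is_pow2_reciprocal_scale(1.f / 32768.f)); // 1/2^15 -> mantissa 0.5, exponent -14
    assert(!is_pow2_reciprocal_scale(0.3f));         // not of the form 1/2^n
}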
@@ -168,9 +183,9 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
+ const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -178,7 +193,7 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -190,52 +205,52 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
- const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+ const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- const float32x4x4_t out_f32x4x4 =
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ // Quantize dst
+ const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+ wrapper::vstore(output_ptr + x, result);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T src1 = *(non_broadcast_input_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ const T src1 = *(non_broadcast_input_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
+ const float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst
+ const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -251,56 +266,59 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto input1_q = wrapper::vloadq(input1_ptr + x);
- const auto input2_q = wrapper::vloadq(input2_ptr + x);
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
- const float32x4x4_t out_f32x4x4 =
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
+ const auto input1_q = wrapper::vloadq(input1_ptr + x);
+ const auto input2_q = wrapper::vloadq(input2_ptr + x);
+
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
+
+ // Quantize dst
+ const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+ wrapper::vstore(output_ptr + x, result);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T src1 = *(input1_ptr + x);
- const T src2 = *(input2_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ const T src1 = *(input1_ptr + x);
+ const T src2 = *(input2_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
+ const float tmp_f = tmp_in1 * tmp_in2;
+
+ // Quantize dst
+ const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ input1, input2, dst);
}
}
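The saturating QASYMM8 path above boils down to a dequantize–multiply–requantize round trip, with the kernel's scale folded into the output quantization (the tmp_qua_info trick). A minimal scalar sketch of that pattern follows; the struct and function names are illustrative, not part of the ACL API:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative only: UniformQInfo and mul_saturate_qasymm8_scalar are not ACL helpers.
struct UniformQInfo { float scale; int32_t offset; };

inline uint8_t mul_saturate_qasymm8_scalar(uint8_t a, uint8_t b,
                                           UniformQInfo qa, UniformQInfo qb,
                                           UniformQInfo qo, float scale)
{
    // Dequantize both operands.
    const float fa = (static_cast<int32_t>(a) - qa.offset) * qa.scale;
    const float fb = (static_cast<int32_t>(b) - qb.offset) * qb.scale;
    // Requantize with the output scale divided by the kernel scale,
    // mirroring tmp_qua_info = {output_scale / scale, output_offset}.
    const float out_scale = qo.scale / scale;
    const int32_t q = static_cast<int32_t>(std::lround(fa * fb / out_scale)) + qo.offset;
    return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}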
-bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
+bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ float scale)
{
const auto iq0 = src0->quantization_info().uniform();
const auto iq1 = src1->quantization_info().uniform();
@@ -308,7 +326,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
- if(multiplier < -8191.f || multiplier > 8191.f)
+ if (multiplier < -8191.f || multiplier > 8191.f)
{
//The multiplier cannot be stored as a 14.18 signed fixed-point number
return false;
@@ -318,7 +336,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
const auto max_result = multiplier * (256) * (256) + offset_out;
- if(max_result > 8191.f)
+ if (max_result > 8191.f)
{
//It might not be possible to store the result as a 14.18 signed fixed-point number.
return false;
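The two early-outs above guard the 14.18 fixed-point representation used by the fast path: 14 integer bits (including sign) and 18 fractional bits, so only magnitudes up to roughly 8192 can be encoded (the kernel uses 8191 as the bound). A simplified sketch of the same feasibility test, with illustrative names and omitting any further checks the real function performs:

#include <cstdint>

inline bool fits_14p18(float v)
{
    // Signed 14.18: 14 integer bits (including sign) and 18 fractional bits.
    return v >= -8191.f && v <= 8191.f;
}

inline bool fixedpoint_mul_feasible(float scale0, float scale1, float scale_out,
                                    float scale, float offset_out)
{
    const float multiplier = ((scale0 * scale1) / scale_out) * scale;
    if (!fits_14p18(multiplier))
    {
        return false; // the multiplier itself cannot be stored as 14.18
    }
    // Worst case for 8-bit inputs: each offset-corrected operand can reach 256 in magnitude.
    const float max_result = multiplier * 256.f * 256.f + offset_out;
    return max_result <= 8191.f;
}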
@@ -366,7 +384,7 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d
const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Prefix: a = non-broadcast, b = broadcast.
@@ -392,78 +410,76 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
- const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
- const auto b_val = *b_ptr;
- const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
- const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+ const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
- const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
- const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
+ const auto b_val = *b_ptr;
+ const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
+ const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
- int x = window_start_x;
+ const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
+ const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the inputs.
- const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
-
- // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
- const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
- const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
-
- const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
- const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
- const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
- const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
-
- const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
- const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
- const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
- const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
-
- const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
- const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
- const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
- const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
-
- // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
- const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
- const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
- const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
- const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
-
- const auto vout_15p1_0 = wrapper::vcombine(
- vout_15p1_00,
- vout_15p1_01);
-
- const auto vout_15p1_1 = wrapper::vcombine(
- vout_15p1_10,
- vout_15p1_11);
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+ int x = window_start_x;
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<2>(vout_15p1_0),
- wrapper::vqrshrn<2>(vout_15p1_1));
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+ // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+ const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+ const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
+ const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
+ const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
+ const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
+
+ const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
+ const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
+ const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
+ const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
+
+ const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+ const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+ const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+ const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+                    // These right shifts undo the earlier multiplication by 2^18; the hard limit of 8 on a single shift requires multiple shift instructions to achieve this.
+ const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+ const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+ const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+ const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+ const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01);
+
+ const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11);
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1));
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
- //Process the left-over elements.
- for(; x < window_end_x; ++x)
- {
+ //Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(
- b_val) - b_offset_16p0)) + out_offset_14p18)));
+ out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
+ (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) +
+ out_offset_14p18)));
#else //__aarch64__
- out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
+ out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+ multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
- }
- },
- a_input_it, b_input_it, out_it);
+ }
+ },
+ a_input_it, b_input_it, out_it);
}
else
{
@@ -481,82 +497,83 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
- const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+ const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the inputs.
- const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
- const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
-
- // Widen the input elements to signed 16-bit regardless of the input signedness.
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
- const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
- const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
- const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
-
- const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
- const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
- const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
- const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
-
- const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
- const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
- const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
- const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
-
- const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
- const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
- const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
- const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
-
- const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
- const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
- const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
- const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
-
- // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
- const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
- const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
- const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
- const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
-
- const auto vout_14p2_0 = wrapper::vcombine(
- vout_14p2_00,
- vout_14p2_01);
-
- const auto vout_14p2_1 = wrapper::vcombine(
- vout_14p2_10,
- vout_14p2_11);
-
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<2>(vout_14p2_0),
- wrapper::vqrshrn<2>(vout_14p2_1));
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+ const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+
+ // Widen the input elements to signed 16-bit regardless of the input signedness.
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+ const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+ const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+ const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+
+ const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
+ const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
+ const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
+ const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
+
+ const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
+ const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
+ const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
+ const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
+
+ const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
+ const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
+ const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
+ const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
+
+ const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+ const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+ const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+ const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+                    // These right shifts undo the earlier multiplication by 2^18; the hard limit of 8 on a single shift requires multiple shift instructions to achieve this.
+ const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+ const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+ const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+ const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+ const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01);
+
+ const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11);
+
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1));
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
- //Process the left-over elements.
- for(; x < window_end_x; ++x)
- {
+ //Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(
- in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
+ out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(
+ wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) *
+ (int32_t(in1_ptr[x]) - in1_offset_16p0)) +
+ out_offset_14p18)));
#else //__aarch64__
- out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
+ out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+ multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) +
+ float(out_offset)));
#endif //__aarch64__
- }
- },
- in0_it, in1_it, out_it);
+ }
+ },
+ in0_it, in1_it, out_it);
}
}
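Put together, the fixed-point kernel above computes (q0 - offset0) * (q1 - offset1) per element, scales it by the 14.18 multiplier, adds the 14.18 output offset, and shifts back down by 18 bits with rounding and saturation; the NEON code splits that final shift into 8 + 8 + 2 because a single saturating shift is capped at 8. A hedged scalar sketch, with names that are illustrative rather than the wrapper:: API:

#include <algorithm>
#include <cstdint>
#include <limits>

template <typename ScalarType> // int8_t or uint8_t
ScalarType mul_q8_fixedpoint_scalar(ScalarType a, ScalarType b,
                                    int32_t a_offset, int32_t b_offset,
                                    int32_t multiplier_14p18, int32_t out_offset_14p18)
{
    // Offset-correct and multiply as plain integers.
    const int64_t inner     = static_cast<int64_t>(int32_t(a) - a_offset) * (int32_t(b) - b_offset);
    const int64_t acc_14p18 = inner * multiplier_14p18 + out_offset_14p18;
    // Rounding right shift by 18 converts the 14.18 accumulator back to an integer value.
    const int64_t rounded   = (acc_14p18 + (int64_t(1) << 17)) >> 18;
    const int64_t lo        = std::numeric_limits<ScalarType>::min();
    const int64_t hi        = std::numeric_limits<ScalarType>::max();
    return static_cast<ScalarType>(std::min(hi, std::max(lo, rounded)));
}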
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+void mul_saturate_QSYMM16_QSYMM16_QSYMM16(
+ const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
@@ -580,66 +597,61 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *sr
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
+ const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const qsymm16x8x2_t input1_q =
+ const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const qsymm16x8x2_t input1_q = {{
vld1q_s16(input1_ptr + x),
vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
+ }};
+ const qsymm16x8x2_t input2_q = {{
vld1q_s16(input2_ptr + x),
vld1q_s16(input2_ptr + x + 8),
- }
- };
+ }};
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+ // Dequantize inputs
+ const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+ const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
+ const float32x4x4_t out_f32x4x4 = {
+ vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+ vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+ vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+ vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+ };
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
- float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
- float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst, lrintf() has same rounding mode as vcombine_s16
- int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
- qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
+ const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ // Dequantize inputs
+ float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
+ float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
+ float tmp_f = tmp_in1 * tmp_in2;
+
+                // Quantize dst; lrintf() uses the same rounding mode as the vectorized quantization above
+ int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
+                qsymm16_t tmp_qua =
+                    static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
+ *(output_ptr + x) = tmp_qua;
+ }
+ },
+ input1, input2, dst);
}
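For QSYMM16 the quantization is symmetric, so dequantization is simply value * scale with no offset to subtract; the requantization again folds the kernel scale into the output scale. A scalar sketch with illustrative names:

#include <climits>
#include <cmath>
#include <cstdint>

inline int16_t mul_saturate_qsymm16_scalar(int16_t a, int16_t b,
                                           float scale_a, float scale_b,
                                           float scale_out, float scale)
{
    // Symmetric quantization: dequantize by the scale only.
    const float f = (a * scale_a) * (b * scale_b);
    // Requantize against output_scale / scale and clamp to the int16 range.
    const int32_t q = static_cast<int32_t>(lrintf(f / (scale_out / scale)));
    return static_cast<int16_t>(q > SHRT_MAX ? SHRT_MAX : (q < SHRT_MIN ? SHRT_MIN : q));
}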
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
@@ -665,74 +677,60 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const qsymm16x8x2_t input1_q =
+ const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const qsymm16x8x2_t input1_q = {{
vld1q_s16(input1_ptr + x),
vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
+ }};
+ const qsymm16x8x2_t input2_q = {{
vld1q_s16(input2_ptr + x),
vld1q_s16(input2_ptr + x + 8),
- }
- };
+ }};
- const int32x4x4_t in1_s32 =
- {
- {
+ const int32x4x4_t in1_s32 = {{
vmovl_s16(vget_low_s16(input1_q.val[0])),
vmovl_s16(vget_high_s16(input1_q.val[0])),
vmovl_s16(vget_low_s16(input1_q.val[1])),
vmovl_s16(vget_high_s16(input1_q.val[1])),
- }
- };
- const int32x4x4_t in2_s32 =
- {
- {
+ }};
+ const int32x4x4_t in2_s32 = {{
vmovl_s16(vget_low_s16(input2_q.val[0])),
vmovl_s16(vget_high_s16(input2_q.val[0])),
vmovl_s16(vget_low_s16(input2_q.val[1])),
vmovl_s16(vget_high_s16(input2_q.val[1])),
- }
- };
+ }};
- const int32x4x4_t result =
- {
- {
+ const int32x4x4_t result = {{
vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
- }
- };
+ }};
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- vst1q_s32(output_ptr + x + 8, result.val[2]);
- vst1q_s32(output_ptr + x + 12, result.val[3]);
- }
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ vst1q_s32(output_ptr + x + 8, result.val[2]);
+ vst1q_s32(output_ptr + x + 12, result.val[3]);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- *(output_ptr + x) = tmp;
- }
- },
- input1, input2, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input1, input2, dst);
}
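The S32-output variant above performs no rescaling at all: each int16 operand is widened (vmovl_s16 in the vector path) and the raw 32-bit products are stored. The scalar equivalent is simply:

#include <cstdint>

inline int32_t mul_qsymm16_to_s32_scalar(int16_t a, int16_t b)
{
    // Widen before multiplying so the product cannot overflow 16 bits.
    return static_cast<int32_t>(a) * static_cast<int32_t>(b);
}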
template <bool is_scale255, bool is_sat>
@@ -757,79 +755,80 @@ void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
- const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
+ const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
- uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
- const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
- uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
- const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
+ uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
+ const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+ uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
+ const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
- tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
- tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
+ tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
- if(is_scale255)
- {
- tmp1_high = scale255_U16_U16(tmp1_high);
- tmp1_low = scale255_U16_U16(tmp1_low);
- }
- else
- {
- const int16x8_t vn = vdupq_n_s16(-n);
+ if (is_scale255)
+ {
+ tmp1_high = scale255_U16_U16(tmp1_high);
+ tmp1_low = scale255_U16_U16(tmp1_low);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
- if(is_sat)
+ if (is_sat)
+ {
+ tmp1_high = vqshlq_u16(tmp1_high, vn);
+ tmp1_low = vqshlq_u16(tmp1_low, vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_u16(tmp1_high, vn);
+ tmp1_low = vshlq_u16(tmp1_low, vn);
+ }
+ }
+ if (is_sat)
{
- tmp1_high = vqshlq_u16(tmp1_high, vn);
- tmp1_low = vqshlq_u16(tmp1_low, vn);
+ vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
}
else
{
- tmp1_high = vshlq_u16(tmp1_high, vn);
- tmp1_low = vshlq_u16(tmp1_low, vn);
+ vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
}
}
- if(is_sat)
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
- }
- else
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<uint16_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
- if(is_sat && tmp > 255)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = 255;
+ uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
+
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ tmp = static_cast<uint16_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ tmp >>= n;
+ }
+ if (is_sat && tmp > 255)
+ {
+ tmp = 255;
+ }
+ *(output_ptr + x) = static_cast<uint8_t>(tmp);
}
- *(output_ptr + x) = static_cast<uint8_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
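The U8 kernel above distinguishes two scalings: is_scale255 applies a multiply by scale255_constant (presumably 1/255) with round-to-nearest, while any other scale is assumed to be an exact power of two 1/2^n and applied as a right shift; is_sat then clamps to 255. A scalar sketch under those assumptions:

#include <cstdint>

inline uint8_t mul_u8_scalar(uint8_t a, uint8_t b, bool is_scale255, bool is_sat, int n)
{
    uint32_t tmp = static_cast<uint32_t>(a) * static_cast<uint32_t>(b);
    if (is_scale255)
    {
        // Assumed equivalent of scale255_constant: multiply by 1/255 and round to nearest.
        tmp = static_cast<uint32_t>(static_cast<float>(tmp) * (1.f / 255.f) + 0.5f);
    }
    else
    {
        tmp >>= n; // scale = 1 / 2^n
    }
    if (is_sat && tmp > 255)
    {
        tmp = 255;
    }
    return static_cast<uint8_t>(tmp);
}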
template <bool is_scale255, bool is_sat>
@@ -843,7 +842,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
- if(is_scale255)
+ if (is_scale255)
{
tmp1_high = scale255_S32_S32(tmp1_high);
tmp1_low = scale255_S32_S32(tmp1_low);
@@ -863,7 +862,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
- if(is_sat)
+ if (is_sat)
{
tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
@@ -875,7 +874,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
}
}
- if(is_sat)
+ if (is_sat)
{
return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
}
@@ -888,15 +887,10 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
- const int16x8x2_t result =
- {
- {
- // First 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
- // Second 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
- }
- };
+ const int16x8x2_t result = {{// First 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
+ // Second 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}};
return result;
}
@@ -923,67 +917,62 @@ void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, con
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
-
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8x2_t ta1 = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const int16x8x2_t ta2 = {{
+ vld1q_s16(input2_ptr + x),
+ vld1q_s16(input2_ptr + x + 8),
+ }};
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
- if(is_scale255)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
+ if (is_scale255)
{
- tmp >>= n;
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
}
else
{
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint32_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ }
}
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+ }
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
}
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
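The left-over loop above adds a mask of (2^n - 1) before shifting negative products: an arithmetic right shift on its own rounds towards negative infinity, and the added mask turns that into rounding towards zero, matching the vector path. In isolation:

#include <cstdint>

inline int32_t shift_round_to_zero(int32_t tmp, int n)
{
    if (tmp >= 0)
    {
        return tmp >> n; // already truncates towards zero for non-negative values
    }
    // Adding 2^n - 1 before the shift converts the floor behaviour into truncation.
    const uint32_t mask = (1u << n) - 1;
    return (tmp + static_cast<int32_t>(mask)) >> n;
}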
template <bool is_sat>
@@ -1012,7 +1001,7 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &
const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
- if(is_sat)
+ if (is_sat)
{
tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
@@ -1029,15 +1018,10 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
- const int32x4x2_t result =
- {
- {
- // First 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
- // Second 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
- }
- };
+ const int32x4x2_t result = {{// First 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
+ // Second 4 elements
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}};
return result;
}
@@ -1058,7 +1042,7 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1074,60 +1058,56 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
+ const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x2_t broadcast_v =
- {
- {
- broadcast_value_vec,
- broadcast_value_vec,
- }
- };
- const int32x4x2_t non_broadcast_v =
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int32x4x2_t broadcast_v = {{
+ broadcast_value_vec,
+ broadcast_value_vec,
+ }};
+ const int32x4x2_t non_broadcast_v = {{
vld1q_s32(non_broadcast_input_ptr + x),
vld1q_s32(non_broadcast_input_ptr + x + 4),
- }
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
+ }};
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
}
- if(is_sat)
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = utility::clamp<int64_t, int32_t>(tmp);
+ int64_t tmp =
+ static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
+
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint64_t mask = ((uint64_t)1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
+ }
+ if (is_sat)
+ {
+ tmp = utility::clamp<int64_t, int32_t>(tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
}
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1140,58 +1120,53 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int32x4x2_t ta1 =
+ const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int32x4x2_t ta1 = {{
+ vld1q_s32(input1_ptr + x),
+ vld1q_s32(input1_ptr + x + 4),
+ }};
+ const int32x4x2_t ta2 = {{
+ vld1q_s32(input2_ptr + x),
+ vld1q_s32(input2_ptr + x + 4),
+ }};
+ const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
+
+ vst1q_s32(output_ptr + x, result.val[0]);
+ vst1q_s32(output_ptr + x + 4, result.val[1]);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
+ int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
+
+ if (tmp >= 0)
{
- vld1q_s32(input1_ptr + x),
- vld1q_s32(input1_ptr + x + 4),
+ tmp >>= n;
}
- };
- const int32x4x2_t ta2 =
- {
+ else
{
- vld1q_s32(input2_ptr + x),
- vld1q_s32(input2_ptr + x + 4),
+ uint64_t mask = ((uint64_t)1u << n) - 1;
+ tmp = (tmp + static_cast<int64_t>(mask)) >> n;
}
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
- }
- if(is_sat)
- {
- tmp = utility::clamp<int64_t, int32_t>(tmp);
+ if (is_sat)
+ {
+ tmp = utility::clamp<int64_t, int32_t>(tmp);
+ }
+ *(output_ptr + x) = static_cast<int32_t>(tmp);
}
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
}
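The S32 kernel forms the product in 64 bits, applies the same round-towards-zero shift as the S16 path, and, when is_sat is set, clamps back into the int32 range (what utility::clamp<int64_t, int32_t> does). A scalar sketch:

#include <algorithm>
#include <cstdint>

inline int32_t mul_s32_scalar(int32_t a, int32_t b, int n, bool is_sat)
{
    int64_t tmp = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    if (tmp >= 0)
    {
        tmp >>= n;
    }
    else
    {
        // Round the shift of a negative product towards zero.
        const uint64_t mask = (uint64_t{1} << n) - 1;
        tmp = (tmp + static_cast<int64_t>(mask)) >> n;
    }
    if (is_sat)
    {
        tmp = std::min<int64_t>(INT32_MAX, std::max<int64_t>(INT32_MIN, tmp));
    }
    return static_cast<int32_t>(tmp);
}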
@@ -1212,7 +1187,7 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con
using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1228,32 +1203,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+ const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+ const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
+ wrapper::vstore(output_ptr + x, res);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1266,32 +1242,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto ta1 = wrapper::vloadq(input1_ptr + x);
- const auto ta2 = wrapper::vloadq(input2_ptr + x);
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
- const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
+ const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto ta1 = wrapper::vloadq(input1_ptr + x);
+ const auto ta2 = wrapper::vloadq(input2_ptr + x);
+ const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+ const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto ta1 = *(input1_ptr + x);
+ const auto ta2 = *(input2_ptr + x);
+ *(output_ptr + x) = ta1 * ta2 * scale;
+ }
+ },
+ input1, input2, dst);
}
}
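The float kernel also shows the broadcast-across-x pattern shared by these routines: when one input has extent 1 in X its window step is 0, so a single value is read per row and reused against every element of the other input, with the kernel scale applied last. Reduced to a scalar row loop (illustrative signature):

#include <cstddef>

inline void mul_f32_broadcast_row(const float *non_broadcast, float broadcast_value,
                                  float *dst, std::size_t len, float scale)
{
    for (std::size_t x = 0; x < len; ++x)
    {
        // One broadcast value per row, multiplied against every non-broadcast element.
        dst[x] = broadcast_value * non_broadcast[x] * scale;
    }
}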
@@ -1312,7 +1289,7 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out,
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1328,48 +1305,49 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out,
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
+ const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
- float32x4_t b = vdupq_n_f32(broadcast_value);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
+ float32x4_t b = vdupq_n_f32(broadcast_value);
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+ const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
- float32x4_t res = wrapper::vmul(tmp0, b);
- b = wrapper::vmul(b, mask);
+ float32x4_t res = wrapper::vmul(tmp0, b);
+ b = wrapper::vmul(b, mask);
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output_ptr + 2 * x, res);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
- const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
- auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
- auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
+ const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
+ auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
+ auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
+ *(output_ptr + 2 * x) = res1;
+ *(output_ptr + 2 * x + 1) = res2;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1382,51 +1360,52 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out,
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
- float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
+ const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
+ float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+ const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
+ const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+ const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+ const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+ const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
- float32x4_t res = wrapper::vmul(tmp0, b);
+ const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+ const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
- b = wrapper::vrev64(b);
- b = wrapper::vmul(b, mask);
+ float32x4_t res = wrapper::vmul(tmp0, b);
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
+ b = wrapper::vrev64(b);
+ b = wrapper::vmul(b, mask);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto a0 = *(input1_ptr + 2 * x);
- const auto a1 = *(input1_ptr + 2 * x + 1);
- const auto b0 = *(input2_ptr + 2 * x);
- const auto b1 = *(input2_ptr + 2 * x + 1);
- auto res1 = a0 * b0 - a1 * b1;
- auto res2 = a0 * b1 + a1 * b0;
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- input1, input2, dst);
+ res = wrapper::vmla(res, tmp1, b);
+ wrapper::vstore(output_ptr + 2 * x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto a0 = *(input1_ptr + 2 * x);
+ const auto a1 = *(input1_ptr + 2 * x + 1);
+ const auto b0 = *(input2_ptr + 2 * x);
+ const auto b1 = *(input2_ptr + 2 * x + 1);
+ auto res1 = a0 * b0 - a1 * b1;
+ auto res2 = a0 * b1 + a1 * b0;
+ *(output_ptr + 2 * x) = res1;
+ *(output_ptr + 2 * x + 1) = res2;
+ }
+ },
+ input1, input2, dst);
}
}
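The {-1, 1, -1, 1} mask and the vrev64 in the non-broadcast branch implement an interleaved complex multiply: for (real, imag) pairs, (a0 + i*a1) * (b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0), which is exactly what the left-over loop computes per pair:

inline void c_mul_f32_scalar(const float a[2], const float b[2], float out[2])
{
    // Treat each pair as a complex number laid out as (real, imag).
    out[0] = a[0] * b[0] - a[1] * b[1];
    out[1] = a[0] * b[1] + a[1] * b[0];
}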
@@ -1444,7 +1423,7 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1457,48 +1436,40 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
- const float16x8x2_t broadcast_value_vec =
+ win,
+ [&](const Coordinates &)
{
- {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
+ const float16x8x2_t broadcast_value_vec = {{
vdupq_n_f16(broadcast_value),
vdupq_n_f16(broadcast_value),
- }
- };
- const auto scale_vec = vdupq_n_f16(scale);
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t non_broadcast_v =
+ }};
+ const auto scale_vec = vdupq_n_f16(scale);
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t non_broadcast_v = {{
vld1q_f16(non_broadcast_input_ptr + x),
vld1q_f16(non_broadcast_input_ptr + x + 8),
- }
- };
- const float16x8x2_t result =
+ }};
+ const float16x8x2_t result = {{
+ vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
+ }};
+ vst1q_f16(output_ptr + x, result.val[0]);
+ vst1q_f16(output_ptr + x + 8, result.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
+ }
+ },
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1508,49 +1479,41 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con
Iterator input2(src2, input2_win);
Iterator dst(out, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t ta1 =
+ win,
+ [&](const Coordinates &)
+ {
+ const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(input1_ptr + x),
- vld1q_f16(input1_ptr + x + 8),
- }
- };
- const float16x8x2_t ta2 =
- {
- {
- vld1q_f16(input2_ptr + x),
- vld1q_f16(input2_ptr + x + 8),
- }
- };
- const float16x8_t scale_vec = vdupq_n_f16(scale);
- const float16x8x2_t result =
+ const float16x8x2_t ta1 = {{
+ vld1q_f16(input1_ptr + x),
+ vld1q_f16(input1_ptr + x + 8),
+ }};
+ const float16x8x2_t ta2 = {{
+ vld1q_f16(input2_ptr + x),
+ vld1q_f16(input2_ptr + x + 8),
+ }};
+ const float16x8_t scale_vec = vdupq_n_f16(scale);
+ const float16x8x2_t result = {{
+ vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+ }};
+ vst1q_f16(output_ptr + x, result.val[0]);
+ vst1q_f16(output_ptr + x + 8, result.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- {
- vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
+ const auto ta1 = *(input1_ptr + x);
+ const auto ta2 = *(input2_ptr + x);
+ *(output_ptr + x) = ta1 * ta2 * scale;
+ }
+ },
+ input1, input2, dst);
}
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
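Both FP16 branches above reduce to the same per-element formula, dst[i] = src1[i] * src2[i] * scale, with one operand optionally broadcast along x. A scalar sketch (float used instead of float16_t for portability, hypothetical function name):

#include <cstddef>

// Scalar sketch of the F16 path above: element-wise multiply with a float
// scale, where one input may be a single broadcast value when its
// x-dimension is 1.
void mul_scale_ref(const float *src1, const float *src2, float *dst,
                   std::size_t n, float scale, bool broadcast_src2)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        const float b = broadcast_src2 ? src2[0] : src2[i];
        dst[i]        = src1[i] * b * scale;
    }
}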
@@ -1577,81 +1540,82 @@ void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
- const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
-
- uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
- uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
- tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
- tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- if(is_scale255)
- {
- tmp_low = scale255_U16_U16(tmp_low);
- tmp_high = scale255_U16_U16(tmp_high);
- }
- else
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const int16x8_t vn = vdupq_n_s16(-n);
+ const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
+ const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
- if(is_sat)
+ uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
+ uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+ tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+ tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+
+ if (is_scale255)
{
- tmp_low = vqshlq_u16(tmp_low, vn);
- tmp_high = vqshlq_u16(tmp_high, vn);
+ tmp_low = scale255_U16_U16(tmp_low);
+ tmp_high = scale255_U16_U16(tmp_high);
}
else
{
- tmp_low = vshlq_u16(tmp_low, vn);
- tmp_high = vshlq_u16(tmp_high, vn);
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if (is_sat)
+ {
+ tmp_low = vqshlq_u16(tmp_low, vn);
+ tmp_high = vqshlq_u16(tmp_high, vn);
+ }
+ else
+ {
+ tmp_low = vshlq_u16(tmp_low, vn);
+ tmp_high = vshlq_u16(tmp_high, vn);
+ }
}
- }
- if(is_sat)
- {
- static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+ if (is_sat)
+ {
+ static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
- tmp_low = vminq_u16(tmp_low, max);
- tmp_high = vminq_u16(tmp_high, max);
+ tmp_low = vminq_u16(tmp_low, max);
+ tmp_high = vminq_u16(tmp_high, max);
+ }
+
+ vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
+ vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
}
- vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
- vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ if (is_scale255)
+ {
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
+ }
+ else
+ {
+ tmp >>= n;
+ }
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+ }
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
}
-
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
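The left-over path above is the scalar reference for the whole U8 x U8 -> S16 kernel: multiply the widened operands, then either apply the 1/255 scaling with round-to-nearest (the scale255_constant case checked in configure()) or shift right by n, and finally clamp to SHRT_MAX when saturation is requested. A standalone sketch (hypothetical function name):

#include <climits>
#include <cstdint>

// Scalar sketch of the U8 x U8 -> S16 element path above; scale255_constant
// is taken to be 1/255, and n is the power-of-two scale exponent otherwise.
int16_t mul_u8_u8_s16_ref(uint8_t a, uint8_t b, bool is_scale255, bool is_sat, int n)
{
    int32_t tmp = static_cast<int32_t>(a) * static_cast<int32_t>(b);
    if (is_scale255)
    {
        const float tmp_f = static_cast<float>(tmp) * (1.f / 255.f);
        tmp               = static_cast<int32_t>(tmp_f + 0.5f); // round to nearest
    }
    else
    {
        tmp >>= n;
    }
    if (is_sat && tmp > SHRT_MAX)
    {
        tmp = SHRT_MAX;
    }
    return static_cast<int16_t>(tmp);
}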
template <bool is_scale255, bool is_sat>
@@ -1676,75 +1640,65 @@ void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons
const auto window_end_x = static_cast<int>(window.x().end());
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const uint8x8x2_t ta2u =
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int16x8x2_t ta1 = {{
+ vld1q_s16(input1_ptr + x),
+ vld1q_s16(input1_ptr + x + 8),
+ }};
+ const uint8x8x2_t ta2u = {{
vld1_u8(input2_ptr + x),
vld1_u8(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
- }
- };
-
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+ }};
+ const int16x8x2_t ta2 = {
+ {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}};
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+ vst1q_s16(output_ptr + x, result.val[0]);
+ vst1q_s16(output_ptr + x + 8, result.val[1]);
+ }
- if(is_scale255)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
+ int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
+ if (is_scale255)
{
- tmp >>= n;
+ float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+ tmp = static_cast<int32_t>(tmp_f + 0.5f);
}
else
{
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ if (tmp >= 0)
+ {
+ tmp >>= n;
+ }
+ else
+ {
+ uint32_t mask = (1u << n) - 1;
+ tmp = (tmp + static_cast<int32_t>(mask)) >> n;
+ }
+ }
+ if (is_sat)
+ {
+ tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
}
+ *(output_ptr + x) = static_cast<int16_t>(tmp);
}
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
+ },
+ input1, input2, dst);
}
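The rounding trick in the left-over path above deserves a note: adding (1 << n) - 1 before the arithmetic shift makes negative products truncate toward zero, matching the plain shift used for non-negative products. A standalone sketch (hypothetical function name):

#include <climits>
#include <cstdint>

// Scalar sketch of the shift-by-n path above: both branches truncate the
// scaled product toward zero, with optional saturation to the S16 range.
int16_t shift_round_to_zero(int32_t tmp, int n, bool is_sat)
{
    if (tmp >= 0)
    {
        tmp >>= n;
    }
    else
    {
        const uint32_t mask = (1u << n) - 1;
        tmp                 = (tmp + static_cast<int32_t>(mask)) >> n;
    }
    if (is_sat)
    {
        tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
    }
    return static_cast<int16_t>(tmp);
}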
template <bool is_scale255, bool is_sat>
@@ -1755,7 +1709,12 @@ void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons
}
} // namespace
-void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void CpuMulKernel::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
@@ -1775,7 +1734,7 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
bool is_scale_255 = false;
// Check and validate scaling factor
- if(std::abs(scale - scale255_constant) < 0.00001f)
+ if (std::abs(scale - scale255_constant) < 0.00001f)
{
is_scale_255 = true;
}
@@ -1795,12 +1754,12 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
const DataType dt_output = dst->data_type();
const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
- switch(dt_input1)
+ switch (dt_input1)
{
case DataType::QASYMM8:
- if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
+ if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
{
- if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
{
_func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
}
@@ -1811,9 +1770,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
}
break;
case DataType::QASYMM8_SIGNED:
- if(dt_input2 == DataType::QASYMM8_SIGNED)
+ if (dt_input2 == DataType::QASYMM8_SIGNED)
{
- if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+ if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
{
_func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
}
@@ -1824,19 +1783,19 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
}
break;
case DataType::QSYMM16:
- if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
+ if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
{
_func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
}
- else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
+ else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
{
_func_int = &mul_QSYMM16_QSYMM16_S32;
}
break;
case DataType::S16:
- if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
}
@@ -1845,9 +1804,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
_func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
}
}
- if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
}
@@ -1858,15 +1817,15 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
}
break;
case DataType::S32:
- if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
+ if (DataType::S32 == dt_input2 && DataType::S32 == dt_output)
{
_func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
}
break;
case DataType::U8:
- if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+ if (DataType::U8 == dt_input2 && DataType::U8 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
}
@@ -1875,9 +1834,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
_func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
}
}
- else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
}
@@ -1886,9 +1845,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *
_func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
}
}
- else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
{
- if(is_scale_255)
+ if (is_scale_255)
{
_func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
}
@@ -1922,20 +1881,20 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_func_float == &mul_F32_F32_F32)
+ if (this->_func_float == &mul_F32_F32_F32)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_mws_V1_fp32_neon;
}
else
{
- if(_split_dimension == Window::DimX)
+ if (_split_dimension == Window::DimX)
{
// Don't split the work load too small if the tensor has been reinterpreted as 1D.
// This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -1945,7 +1904,7 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -1958,10 +1917,10 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return std::max(static_cast<size_t>(1), mws);
}
}
-#else /* ENABLE_FP32_KERNELS */
+#else /* ENABLE_FP32_KERNELS */
ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
- if(_split_dimension == Window::DimX)
+ if (_split_dimension == Window::DimX)
{
// Don't split the work load too small if the tensor has been reinterpreted as 1D.
// This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -1970,8 +1929,12 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return default_mws;
}
-Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
+Status CpuMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
@@ -1989,11 +1952,11 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre
auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_func_quantized != nullptr)
+ if (_func_quantized != nullptr)
{
(*_func_quantized)(src1, src2, dst, window, _scale);
}
- else if(_func_int != nullptr)
+ else if (_func_int != nullptr)
{
(*_func_int)(src1, src2, dst, window, _scale_exponent);
}
@@ -2021,10 +1984,11 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
}
return Status{};
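As a usage sketch for the reformatted validate()/configure() signatures above (the tensor shapes, scale and policies are illustrative assumptions, not taken from this patch):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/kernels/CpuMulKernel.h" // internal header, available when building the library

// Hypothetical driver for the validate()/configure() pair; F32 inputs with a
// unit scale (1 / 2^0) satisfy the scale check performed in configure().
void configure_mul_example()
{
    using namespace arm_compute;
    TensorInfo src1(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo src2(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo dst(TensorShape(16U, 4U), 1, DataType::F32);

    const Status status = cpu::kernels::CpuMulKernel::validate(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE,
                                                                RoundingPolicy::TO_ZERO);
    if (status.error_code() == ErrorCode::OK)
    {
        cpu::kernels::CpuMulKernel kernel;
        kernel.configure(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    }
}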
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index 9e4a37110b..7eaf287507 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_MUL_KERNEL_H
#include "arm_compute/core/Rounding.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -68,17 +69,27 @@ public:
* @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
* @param[in] rounding_policy Rounding policy.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuMulKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy);
// Inherited methods overridden
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Return minimum workload size of the relevant kernel
@@ -108,7 +119,8 @@ private:
* @param[in] window Region on which to execute the kernel
* @param[in] scale Integer scale factor.
*/
- using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
+ using MulFunctionInt =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
/** Common signature for all the specialised multiplication functions with float scaling factor
*
* @param[in] src1 Src1 tensor object.
@@ -117,7 +129,8 @@ private:
* @param[in] window Region on which to execute the kernel
* @param[in] scale Float scale factor.
*/
- using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+ using MulFunctionFloat =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
/** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
*
* @param[in] src1 Src1 tensor object.
@@ -127,14 +140,15 @@ private:
* @param[in] scale Float scale factor.
*
*/
- using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+ using MulFunctionQuantized =
+ void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
- MulFunctionFloat *_func_float{ nullptr };
- MulFunctionInt *_func_int{ nullptr };
- MulFunctionQuantized *_func_quantized{ nullptr };
- float _scale{ 0 };
- int _scale_exponent{ 0 };
- size_t _split_dimension{ Window::DimY };
+ MulFunctionFloat *_func_float{nullptr};
+ MulFunctionInt *_func_int{nullptr};
+ MulFunctionQuantized *_func_quantized{nullptr};
+ float _scale{0};
+ int _scale_exponent{0};
+ size_t _split_dimension{Window::DimY};
};
/** Interface for the complex pixelwise multiplication kernel. */
@@ -159,7 +173,7 @@ public:
static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp
index d65e011032..b444a25ff7 100644
--- a/src/cpu/kernels/CpuPermuteKernel.cpp
+++ b/src/cpu/kernels/CpuPermuteKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -48,56 +49,31 @@ namespace
{
inline bool is_permutation_supported(const PermutationVector &v)
{
- static const std::array<PermutationVector, 2> permutations2 =
- {
- {
- PermutationVector(0U, 1U),
- PermutationVector(1U, 0U),
- }
- };
- static const std::array<PermutationVector, 6> permutations3 =
- {
- {
- PermutationVector(2U, 0U, 1U),
- PermutationVector(1U, 2U, 0U),
- PermutationVector(0U, 1U, 2U),
- PermutationVector(0U, 2U, 1U),
- PermutationVector(1U, 0U, 2U),
- PermutationVector(2U, 1U, 0U),
- }
- };
- static const std::array<PermutationVector, 24> permutations4 =
- {
- {
- PermutationVector(0U, 1U, 2U, 3U),
- PermutationVector(1U, 0U, 2U, 3U),
- PermutationVector(2U, 0U, 1U, 3U),
- PermutationVector(0U, 2U, 1U, 3U),
- PermutationVector(1U, 2U, 0U, 3U),
- PermutationVector(2U, 1U, 0U, 3U),
- PermutationVector(2U, 1U, 3U, 0U),
- PermutationVector(1U, 2U, 3U, 0U),
- PermutationVector(3U, 2U, 1U, 0U),
- PermutationVector(2U, 3U, 1U, 0U),
- PermutationVector(1U, 3U, 2U, 0U),
- PermutationVector(3U, 1U, 2U, 0U),
- PermutationVector(3U, 0U, 2U, 1U),
- PermutationVector(0U, 3U, 2U, 1U),
- PermutationVector(2U, 3U, 0U, 1U),
- PermutationVector(3U, 2U, 0U, 1U),
- PermutationVector(0U, 2U, 3U, 1U),
- PermutationVector(2U, 0U, 3U, 1U),
- PermutationVector(1U, 0U, 3U, 2U),
- PermutationVector(0U, 1U, 3U, 2U),
- PermutationVector(3U, 1U, 0U, 2U),
- PermutationVector(1U, 3U, 0U, 2U),
- PermutationVector(0U, 3U, 1U, 2U),
- PermutationVector(3U, 0U, 1U, 2U)
- }
- };
+ static const std::array<PermutationVector, 2> permutations2 = {{
+ PermutationVector(0U, 1U),
+ PermutationVector(1U, 0U),
+ }};
+ static const std::array<PermutationVector, 6> permutations3 = {{
+ PermutationVector(2U, 0U, 1U),
+ PermutationVector(1U, 2U, 0U),
+ PermutationVector(0U, 1U, 2U),
+ PermutationVector(0U, 2U, 1U),
+ PermutationVector(1U, 0U, 2U),
+ PermutationVector(2U, 1U, 0U),
+ }};
+ static const std::array<PermutationVector, 24> permutations4 = {
+ {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U),
+ PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U),
+ PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U),
+ PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U),
+ PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U),
+ PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U),
+ PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U),
+ PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}};
- return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v))
- || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+ return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) ||
+ (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) ||
+ (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
}
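The tables above enumerate every supported rank-2/3/4 permutation, so the check is a plain lookup. A generic sketch of the same idea with standard containers (hypothetical types, not the library's PermutationVector):

#include <algorithm>
#include <array>
#include <vector>

// Generic sketch of the table-lookup check above: a permutation is supported
// only if it appears verbatim in the table for its rank.
bool is_supported_rank2(const std::vector<unsigned> &perm)
{
    static const std::array<std::vector<unsigned>, 2> rank2 = {{{0, 1}, {1, 0}}};
    return perm.size() == 2 && std::find(rank2.begin(), rank2.end(), perm) != rank2.end();
    // Rank-3 and rank-4 permutations would be checked against their own tables
    // in exactly the same way.
}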
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
@@ -108,7 +84,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
// Validate configured destination
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -128,18 +104,22 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
// we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others
// we have to fall back to C++
- if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }))
+ if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) ||
+ (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}))
{
- window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
- window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
- window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+ window_src.set(Window::DimX,
+ Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+ window_src.set(Window::DimY,
+ Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+ window_src.set(Window::DimZ,
+ Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
}
// Destination window
Window window_dst(window);
const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
+ for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
{
window_dst.set(d, zero_window);
}
@@ -157,7 +137,7 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
int n_channels = 0;
int n_batches = 0;
- switch(src_layout)
+ switch (src_layout)
{
case DataLayout::NCHW:
{
@@ -189,38 +169,42 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
}
// CHW -> HWC
- if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U })
+ if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U})
{
const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
- reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_channels, n_rows, n_cols,
- in_batch_stride, in_channel_stride, in_row_stride,
- out_batch_stride, out_row_stride, out_col_stride);
- },
- src_it, dst_it);
+ execute_window_loop(
+ window_src,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
+ reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()),
+ reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols,
+ in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride,
+ out_row_stride, out_col_stride);
+ },
+ src_it, dst_it);
}
// HWC -> CHW
- else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })
+ else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})
{
const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
- reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_rows, n_cols, n_channels,
- in_batch_stride, in_row_stride, in_col_stride,
- out_batch_stride, out_channel_stride, out_row_stride);
- },
- src_it, dst_it);
+ execute_window_loop(
+ window_src,
+ [&](const Coordinates &id)
+ {
+ const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
+ reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()),
+ reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels,
+ in_batch_stride, in_row_stride, in_col_stride, out_batch_stride,
+ out_channel_stride, out_row_stride);
+ },
+ src_it, dst_it);
}
else
{
@@ -230,12 +214,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c
Strides perm_strides = strides;
permute_strides(perm_strides, perm);
const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
- *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
- },
- src_it, dst_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int idx =
+ id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
+ *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
+ },
+ src_it, dst_it);
}
}
} // namespace
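The generic fallback above builds the destination index from the source coordinates and the permuted strides. A minimal rank-3 sketch of that indexing scheme (plain arrays and hypothetical strides, not the library's Window machinery):

#include <cstddef>

// Minimal sketch of stride-permuted copying for a rank-3 tensor: the source
// is walked in order and each element lands at the position given by the
// already-permuted destination strides, mirroring the fallback loop above.
void permute_ref(const float *src, float *dst,
                 const std::size_t dims[3],        // source extents
                 const std::size_t src_strides[3], // source strides (in elements)
                 const std::size_t dst_strides[3]) // destination strides, already permuted
{
    for (std::size_t z = 0; z < dims[2]; ++z)
        for (std::size_t y = 0; y < dims[1]; ++y)
            for (std::size_t x = 0; x < dims[0]; ++x)
            {
                const std::size_t src_idx = x * src_strides[0] + y * src_strides[1] + z * src_strides[2];
                const std::size_t dst_idx = x * dst_strides[0] + y * dst_strides[1] + z * dst_strides[2];
                dst[dst_idx]              = src[src_idx];
            }
}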
@@ -275,7 +262,7 @@ void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- switch(src->info()->element_size())
+ switch (src->info()->element_size())
{
case 1:
run_permute<uint8_t>(window, src, dst, _perm);
diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h
index 9e1b93318e..0cb2faf223 100644
--- a/src/cpu/kernels/CpuPermuteKernel.h
+++ b/src/cpu/kernels/CpuPermuteKernel.h
@@ -57,7 +57,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp
index d72a41cbbe..9308d860d1 100644
--- a/src/cpu/kernels/CpuPool2dKernel.cpp
+++ b/src/cpu/kernels/CpuPool2dKernel.cpp
@@ -25,15 +25,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/pool2d/neon/list.h"
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -46,99 +48,111 @@ namespace
{
using namespace misc::shape_calculator;
-static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels =
-{
- {
- "neon_qu8_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
- },
- {
- "neon_qs8_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
- },
- {
- "neon_f16_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
- },
- {
- "neon_fp32_nhwc_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
- },
+static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = {
+ {"neon_qu8_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)},
+ {"neon_qs8_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)},
+ {"neon_f16_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)},
+ {"neon_fp32_nhwc_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
+ REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)},
#if defined(ENABLE_NCHW_KERNELS)
- {
- "neon_qu8_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
- },
- {
- "neon_qu8_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
- },
- {
- "neon_qu8_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
- },
- {
- "neon_qs8_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
- },
- {
- "neon_qs8_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
- },
- {
- "neon_qs8_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
- },
- {
- "neon_fp16_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
- },
- {
- "neon_fp16_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
- },
- {
- "neon_fp16_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
- },
- {
- "neon_fp32_nchw_pool2",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
- },
- {
- "neon_fp32_nchw_pool3",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
- },
- {
- "neon_fp32_nchw_pool7",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
- },
- {
- "neon_fp32_nchw_poolMxN",
- [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
- },
+ {"neon_qu8_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)},
+ {"neon_qu8_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)},
+ {"neon_qu8_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)},
+ {"neon_qs8_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)},
+ {"neon_qs8_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+ },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)},
+ {"neon_qs8_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)},
+ {"neon_fp16_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)},
+ {"neon_fp16_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)},
+ {"neon_fp16_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
+ REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)},
+ {"neon_fp32_nchw_pool2",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)},
+ {"neon_fp32_nchw_pool3",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)},
+ {"neon_fp32_nchw_pool7",
+ [](const PoolDataTypeISASelectorData &data)
+ {
+ return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+ (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7));
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)},
+ {"neon_fp32_nchw_poolMxN",
+ [](const PoolDataTypeISASelectorData &data)
+ { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
+ REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)},
#endif /* defined(ENABLE_NCHW_KERNELS) */
};
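The reformatted table above pairs each micro-kernel with a selector lambda; selection simply returns the first entry whose predicate matches the data type, layout, pool size and ISA. A generic sketch of this selector-table pattern (hypothetical types, not the library's selector structs):

#include <functional>
#include <string>
#include <vector>

// Generic sketch of the selector-table dispatch used above: each candidate
// kernel carries a predicate, and the first matching entry wins.
struct SelectorData
{
    bool is_nhwc;
    bool is_f32;
};

struct Candidate
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
};

const Candidate *pick(const std::vector<Candidate> &table, const SelectorData &data)
{
    for (const auto &c : table)
    {
        if (c.is_selected(data))
        {
            return &c;
        }
    }
    return nullptr;
}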
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
- const ITensorInfo *indices, Size2D pool_size)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices,
+ Size2D pool_size)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0);
@@ -150,65 +164,78 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
int output_height = 0;
PoolingType pool_type = pool_info.pool_type;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type()))
- && (is_pool_region_entirely_outside_input(pool_info)),
- "Pooling region that is entirely outside input tensor is unsupported for non-float types");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)),
+ "Pooling region that is entirely outside input tensor is unsupported for non-float types");
- std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- pool_size.x(), pool_size.y(), pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
+ std::tie(output_width, output_height) =
+ scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(),
+ pool_size.y(), pool_info.pad_stride_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+ "Calculated output dimension size is invalid");
TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- if(indices)
+ if (indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
}
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (src->data_layout() == DataLayout::NHWC),
- "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding &&
+ (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() &&
+ (src->data_layout() == DataLayout::NHWC),
+ "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
- if(indices)
+ if (indices)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), "Pooling kernel indices only supported for NHWC");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices),
+ "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC),
+ "Pooling kernel indices only supported for NHWC");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info);
}
}
- const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() });
+ const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{
+ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
}
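For context, scaled_dimensions_signed above yields the pooled output extent that is then checked for validity. Assuming floor rounding and a non-negative numerator (an assumption; the real helper also honours the ceil mode carried by PadStrideInfo), the computation reduces to:

// Sketch of the pooled output-size computation checked above, assuming floor
// rounding; a result below 1 is rejected by the validation code.
int pooled_dim(int in, int pool, int stride, int pad_before, int pad_after)
{
    return (in + pad_before + pad_after - pool) / stride + 1;
}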
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
- unsigned int &num_elems_processed_per_iteration,
- int pool_size_x, int pool_size_y)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src,
+ ITensorInfo *dst,
+ ITensorInfo *indices,
+ const PoolingLayerInfo &pool_info,
+ unsigned int &num_elems_processed_per_iteration,
+ int pool_size_x,
+ int pool_size_y)
{
    // dst auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
- if(indices)
+ if (indices)
{
        // Indices auto initialization if not yet initialized
- auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
- pool_info)))
- .set_data_type(DataType::U32) /* we store the offset to the element */);
+ auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)))
+ .set_data_type(DataType::U32) /* we store the offset to the element */);
}
const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -219,20 +246,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const bool is_square = pool_size_x == pool_size_y;
- const unsigned int pooled_w = dst->dimension(idx_width);
- const unsigned int pooled_h = dst->dimension(idx_height);
+ const bool is_square = pool_size_x == pool_size_y;
+ const unsigned int pooled_w = dst->dimension(idx_width);
+ const unsigned int pooled_h = dst->dimension(idx_height);
    // If it's not square and optimized, the MxN kernel will be executed
num_elems_processed_per_iteration = 1;
- if(is_square)
+ if (is_square)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
- switch(pool_size_x)
+ switch (pool_size_x)
{
case 2:
num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
@@ -261,18 +288,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
bool window_changed = false;
Window win{};
// Upper limit for the number of right/bottom border elements that are accessed
- TensorShape dst_shape{ src->tensor_shape() };
+ TensorShape dst_shape{src->tensor_shape()};
dst_shape.set(0, pooled_w);
dst_shape.set(1, pooled_h);
TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
-void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2dKernel::configure(ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -284,14 +315,15 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Update pool size in case of global pooling
- const Size2D pool_size(
- is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
+ const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
- const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() });
+ const auto *uk = CpuPool2dKernel::get_implementation(
+ PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first,
+ pool_size, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
// Set instance variables
@@ -302,7 +334,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
_run_method = uk->ukernel;
_name = std::string("CpuPool2dKernel").append("/").append(uk->name);
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
// Configure kernel window
Window win = calculate_max_window(*dst, Steps());
@@ -311,14 +343,17 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
else
{
// Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
- pool_size.x(), pool_size.y());
+ auto win_config = validate_and_configure_window(
+ src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y());
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICpuKernel::configure(win_config.second);
}
}
-Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -336,9 +371,10 @@ Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst,
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y)));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
- (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration,
- pool_size_x, pool_size_y)
- .first);
+ (indices) ? indices->clone().get() : nullptr, pool_info,
+ num_elems_processed_per_iteration, pool_size_x,
+ pool_size_y)
+ .first);
return Status{};
}
@@ -359,19 +395,20 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
const unsigned int pool_size = _pool_info.pool_size.width;
Window window_src(window);
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
// Set step for src in x and y direction for the src
unsigned int window_x_inc = 0;
- switch(src->info()->data_type())
+ switch (src->info()->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
{
window_x_inc = pool_stride_x;
- if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+ if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
{
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2
+ : _num_elems_processed_per_iteration;
}
break;
}
@@ -387,8 +424,10 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
ARM_COMPUTE_ERROR("Not supported");
}
}
- window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
- window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+ window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x,
+ window.x().end() * pool_stride_x, window_x_inc));
+ window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y,
+ window.y().end() * pool_stride_y, pool_stride_y));
}
else
{
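
Note on the NCHW branch above: the source-window X increment depends on the pool size, the stride and how many output elements each vectorised iteration produces. A minimal standalone sketch of that stepping rule, outside the library and with hypothetical names:

    #include <cassert>
    #include <iostream>

    // Hypothetical helper mirroring the window_x_inc selection in the NCHW
    // quantized path of CpuPool2dKernel::run_op. Not library code.
    unsigned int src_window_x_increment(unsigned int pool_size,
                                        unsigned int pool_stride_x,
                                        unsigned int elems_per_iteration)
    {
        unsigned int window_x_inc = pool_stride_x; // default: step by the pooling stride
        const bool small_pool     = (pool_size == 2 || pool_size == 3);
        if (small_pool && pool_stride_x < 3)
        {
            // The vectorised kernels emit a whole batch of output elements per
            // iteration, so the input window advances by that batch
            // (doubled when the stride is 2).
            window_x_inc = (pool_stride_x == 2) ? elems_per_iteration * 2 : elems_per_iteration;
        }
        return window_x_inc;
    }

    int main()
    {
        assert(src_window_x_increment(2, 2, 8) == 16);  // stride 2: double the batch
        assert(src_window_x_increment(3, 1, 15) == 15); // stride 1: one batch
        assert(src_window_x_increment(5, 2, 8) == 2);   // MxN path: plain stride
        std::cout << "stepping rule checks passed\n";
    }
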
diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h
index c952ea839d..859de8cc5f 100644
--- a/src/cpu/kernels/CpuPool2dKernel.h
+++ b/src/cpu/kernels/CpuPool2dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -38,7 +39,8 @@ namespace kernels
class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel>
{
private:
- using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+ using PoolingKernelPtr = std::add_pointer<void(
+ const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
public:
CpuPool2dKernel() = default;
@@ -52,17 +54,21 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuPool2dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct PoolingKernel
@@ -76,11 +82,11 @@ public:
private:
PoolingLayerInfo _pool_info{};
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- unsigned int _num_elems_processed_per_iteration{ 0 };
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{0};
Size2D _pool_size{};
int _pool_stride_x{};
- PoolingKernelPtr _run_method{ nullptr };
+ PoolingKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
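
The header above wraps its micro-kernel signature in a std::add_pointer<...>::type alias and pairs it with a name inside a small registry struct. A self-contained sketch of that idiom, using stand-in types rather than the library's:

    #include <iostream>
    #include <string>
    #include <type_traits>

    struct Window {};            // stand-ins for the library types
    struct PoolingLayerInfo {};

    // Function-pointer alias built with std::add_pointer, as in the header above.
    using PoolingKernelPtr =
        std::add_pointer<void(const float *, float *, const PoolingLayerInfo &, const Window &)>::type;

    // Registry entry pairing a human-readable name with the implementation pointer.
    struct PoolingKernel
    {
        std::string      name;
        PoolingKernelPtr ukernel;
    };

    void reference_pool(const float *, float *, const PoolingLayerInfo &, const Window &)
    {
        std::cout << "running reference pooling micro-kernel\n";
    }

    int main()
    {
        PoolingKernel    entry{"ref_fp32_pool", &reference_pool};
        PoolingLayerInfo info;
        Window           win;
        entry.ukernel(nullptr, nullptr, info, win); // dispatch through the stored pointer
    }
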
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
index 4504f3f7c9..8b484d4e0b 100644
--- a/src/cpu/kernels/CpuPool3dKernel.cpp
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/pool3d/list.h"
@@ -41,39 +42,28 @@ namespace
{
using namespace misc::shape_calculator;
-static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
-{
- {
- "neon_qu8_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)
- },
- {
- "neon_qs8_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)
- },
- {
- "neon_fp16_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
- },
- {
- "neon_fp32_ndhwc_poolMxNxD",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)
- }
-};
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = {
+ {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)},
+ {"neon_qs8_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)},
+ {"neon_fp16_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)},
+ {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}};
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
- && (pool_info.pool_type == PoolingType::AVG)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+ (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
"Exclude padding is unsupported for non-float types for Avg op");
const auto data_layout = src->data_layout();
@@ -97,21 +87,26 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
int output_height = 0;
int output_depth = 0;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
- std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth],
- pool_size_x, pool_size_y, pool_size_z, pool_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+ src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+ TensorInfo out_info(
+ TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
}
- const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
return Status{};
@@ -136,12 +131,12 @@ void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
// Update pool size in case of global pooling
const bool is_global_pooling = pool_info.is_global_pooling;
- const Size3D pool_size(
- is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
- is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+ const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+ is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
- const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk =
+ CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
// Set instance variables
@@ -188,4 +183,4 @@ const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_availa
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
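
The reformatted available_kernels table above follows a pattern that recurs across these kernels: a vector of {name, selector predicate, function pointer} entries queried with the current data type and ISA, returning the first match. A minimal standalone sketch of that selection scheme, all names hypothetical:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    enum class DataType { F32, F16, QASYMM8 };

    struct SelectorData
    {
        DataType dt;
        bool     fp16_supported;
    };

    struct KernelEntry
    {
        std::string                               name;
        std::function<bool(const SelectorData &)> is_selected;
        void (*ukernel)();
    };

    void fp32_impl() { std::cout << "fp32 kernel\n"; }
    void fp16_impl() { std::cout << "fp16 kernel\n"; }

    // First-match lookup over the table, in the spirit of get_implementation() above.
    const KernelEntry *get_implementation(const std::vector<KernelEntry> &table, const SelectorData &data)
    {
        for (const auto &entry : table)
        {
            if (entry.is_selected(data))
            {
                return &entry;
            }
        }
        return nullptr;
    }

    int main()
    {
        const std::vector<KernelEntry> table = {
            {"sketch_fp16", [](const SelectorData &d) { return d.dt == DataType::F16 && d.fp16_supported; }, fp16_impl},
            {"sketch_fp32", [](const SelectorData &d) { return d.dt == DataType::F32; }, fp32_impl},
        };

        const KernelEntry *uk = get_implementation(table, SelectorData{DataType::F32, false});
        if (uk != nullptr)
        {
            std::cout << "selected " << uk->name << ": ";
            uk->ukernel();
        }
    }
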
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
index 437f2af7e4..bd1ff61046 100644
--- a/src/cpu/kernels/CpuPool3dKernel.h
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,7 +40,8 @@ class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel>
{
private:
/* Template function for Pooling 3D NDHWC */
- using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+ using Pooling3dKernelPtr =
+ std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
public:
CpuPool3dKernel() = default;
@@ -68,7 +70,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct Pooling3dKernel
@@ -82,11 +84,11 @@ public:
private:
Pooling3dLayerInfo _pool_info{};
- Pooling3dKernelPtr _run_method{ nullptr };
+ Pooling3dKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 9700c62318..5dde680837 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -28,13 +28,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/CPP/Validate.h"
#include <arm_neon.h>
#include <map>
@@ -53,9 +53,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QASYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
return Status{};
@@ -71,19 +73,15 @@ inline float32x4x4_t load_value(const T *input_ptr)
template <>
inline float32x4x4_t load_value(const float *input_ptr)
{
- return { wrapper::vloadq(input_ptr),
- wrapper::vloadq(input_ptr + 4),
- wrapper::vloadq(input_ptr + 8),
- wrapper::vloadq(input_ptr + 12) };
+ return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8),
+ wrapper::vloadq(input_ptr + 12)};
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
inline float32x4x4_t load_value(const float16_t *input_ptr)
{
- return { vcvt_f32_f16(wrapper::vload(input_ptr)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
+ return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+ vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -113,26 +111,25 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map =
- {
- { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
- { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
- { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
+ static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = {
+ {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>},
+ {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>},
+ {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>},
- { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
- { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
- { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
+ {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>},
+ {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
+ {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
- { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> },
+ {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
- { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
- { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
- { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
+ {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
+ {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>},
+ {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
- { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
- { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
+ {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>},
+ {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>},
+ {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t>},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
};
@@ -142,7 +139,7 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
auto it = quant_map.find(function_to_call);
- if(it == quant_map.end())
+ if (it == quant_map.end())
{
ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
}
@@ -167,7 +164,7 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
{
uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
}
@@ -177,22 +174,24 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
- auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
- }
- },
- input, output);
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+ }
+ },
+ input, output);
}
template <typename TIn, typename TOut>
@@ -203,7 +202,7 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
{
uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
}
@@ -219,23 +218,25 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
- auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
+ auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
+ auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+ }
+ },
+ input, output);
}
template <typename T>
@@ -246,7 +247,7 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst,
const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+ if (is_data_type_quantized_asymmetric(src->info()->data_type()))
{
uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
}
@@ -262,25 +263,27 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst,
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
- vst1q_u16(&output_ptr[x], tmp.val[0]);
- vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step); x += window_step)
+ {
+ uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+ vst1q_u16(&output_ptr[x], tmp.val[0]);
+ vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+ }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+ }
+ },
+ input, output);
}
void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
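
The reflowed quantisation loops above all share one shape: a main loop that stores window_step elements per vectorised iteration, followed by a scalar tail for the leftovers. A standalone sketch of that loop structure using plain arrays instead of the NEON wrappers, with made-up quantisation parameters:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Scalar stand-in for the per-element quantisation helper (assumed parameters).
    static uint8_t quantize(float v, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(v / scale)) + offset;
        return static_cast<uint8_t>(std::clamp(q, 0, 255));
    }

    int main()
    {
        const std::vector<float> input = {0.f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.0f};
        std::vector<uint8_t>     output(input.size());

        const int   window_step  = 4; // vector-width stand-in (the kernel uses 16)
        const int   window_end_x = static_cast<int>(input.size());
        const float scale        = 1.f / 255.f;
        const int   offset       = 0;

        int x = 0;
        // Main loop: whole batches of window_step elements (vstore in the real kernel).
        for (; x <= window_end_x - window_step; x += window_step)
        {
            for (int lane = 0; lane < window_step; ++lane)
            {
                output[x + lane] = quantize(input[x + lane], scale, offset);
            }
        }
        // Leftover elements that do not fill a whole batch are handled one by one.
        for (; x < window_end_x; ++x)
        {
            output[x] = quantize(input[x], scale, offset);
        }

        std::cout << "last quantised value: " << static_cast<int>(output.back()) << "\n";
    }
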
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index 2bc8105a11..d6714136da 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -59,7 +59,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
@@ -67,7 +67,9 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+ using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src,
+ ITensor *dst,
+ const Window &window);
/** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
*
* @param[in] window Region on which to execute the kernel.
@@ -84,7 +86,7 @@ private:
template <typename TIn, typename TOut>
void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
- QuantizeFunctionExecutorPtr _func{ nullptr };
+ QuantizeFunctionExecutorPtr _func{nullptr};
};
} // namespace kernels
} // namespace cpu
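
CpuQuantizeKernel keeps its chosen routine as a pointer to member function and picks it from a string-keyed map built from the source and destination data types. A self-contained sketch of that dispatch idiom, with a hypothetical class and keys:

    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    class QuantizeDispatcher
    {
    public:
        void configure(const std::string &src_type, const std::string &dst_type)
        {
            // Map key is assembled from the two data-type names, as in the kernel above.
            static const std::map<std::string, ExecutorPtr> quant_map = {
                {"op_F32_QASYMM8", &QuantizeDispatcher::run_f32_to_qasymm8},
                {"op_F32_QASYMM16", &QuantizeDispatcher::run_f32_to_qasymm16},
            };

            const auto it = quant_map.find("op_" + src_type + "_" + dst_type);
            if (it == quant_map.end())
            {
                throw std::runtime_error("Unsupported combination of input and output data types");
            }
            _func = it->second;
        }

        void run() { (this->*_func)(); } // invoke through the stored member pointer

    private:
        using ExecutorPtr = void (QuantizeDispatcher::*)();

        void run_f32_to_qasymm8() { std::cout << "F32 -> QASYMM8 path\n"; }
        void run_f32_to_qasymm16() { std::cout << "F32 -> QASYMM16 path\n"; }

        ExecutorPtr _func{nullptr};
    };

    int main()
    {
        QuantizeDispatcher dispatcher;
        dispatcher.configure("F32", "QASYMM8");
        dispatcher.run();
    }
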
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index a9672a8c5e..241e58fbce 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -29,9 +29,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
+
#include "src/core/helpers/Utils.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+
#include <cstdint>
/** [NEReshapeLayerKernel Kernel] **/
@@ -49,7 +51,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(dst->tensor_shape().total_size() != 0)
+ if (dst->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -59,29 +61,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
return Status{};
}
-
template <typename T>
void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
{
const TensorShape &src_shape = src->info()->tensor_shape();
const TensorShape &dst_shape = dst->info()->tensor_shape();
- Iterator dst_it(dst, window);
+ Iterator dst_it(dst, window);
- execute_window_loop(window, [&](const Coordinates & dst_coord)
- {
- Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
- const auto output_ptr = dst->ptr_to_element(dst_coord);
- const auto input_ptr = src->ptr_to_element(src_coord);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &dst_coord)
+ {
+ Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ const auto output_ptr = dst->ptr_to_element(dst_coord);
+ const auto input_ptr = src->ptr_to_element(src_coord);
- *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
- },
- dst_it);
+ *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+ },
+ dst_it);
}
-void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst )
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
{
- switch(src->info()->data_type())
+ switch (src->info()->data_type())
{
case DataType::U8:
case DataType::S8:
@@ -131,22 +134,24 @@ void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *d
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator dst_it(dst, win);
- execute_window_loop(win, [&]( Coordinates & id)
- {
- dst_coord = id;
-
- for(int x = window_start_x; x < window_end_x; x += src_row_size)
+ execute_window_loop(
+ win,
+ [&](Coordinates &id)
{
- src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
- output_ptr = dst->ptr_to_element(dst_coord);
- input_ptr = src->ptr_to_element(src_coord);
+ dst_coord = id;
- std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+ for (int x = window_start_x; x < window_end_x; x += src_row_size)
+ {
+ src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+ output_ptr = dst->ptr_to_element(dst_coord);
+ input_ptr = src->ptr_to_element(src_coord);
- dst_coord.increment(Window::DimX, src_row_size);
- }
- },
- dst_it);
+ std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+ dst_coord.increment(Window::DimX, src_row_size);
+ }
+ },
+ dst_it);
}
void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
@@ -213,8 +218,8 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors)
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const ITensorInfo* src_info = src->info();
- const ITensorInfo* dst_info = dst->info();
+ const ITensorInfo *src_info = src->info();
+ const ITensorInfo *dst_info = dst->info();
// Calculate kernel window based on the padding info
Window win;
@@ -226,7 +231,7 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors)
const auto src_row_size = static_cast<int>(src_info->tensor_shape()[0]);
const auto dst_row_size = static_cast<int>(dst_info->tensor_shape()[0]);
- if(!src_has_holes && !dst_has_holes)
+ if (!src_has_holes && !dst_has_holes)
{
std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
/*
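
The per-element reshape above walks the destination tensor, linearises each destination coordinate, and maps that index back to a source coordinate; this works because reshape preserves the element order, with dimension 0 as the fastest-moving axis. A standalone sketch of that round trip, using hypothetical helpers rather than the library's index2coords/coords2index:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    using Shape  = std::vector<size_t>;
    using Coords = std::vector<size_t>;

    // Linearise coordinates with dimension 0 as the fastest-moving axis.
    size_t coords_to_index(const Shape &shape, const Coords &coord)
    {
        size_t index  = 0;
        size_t stride = 1;
        for (size_t d = 0; d < shape.size(); ++d)
        {
            index += coord[d] * stride;
            stride *= shape[d];
        }
        return index;
    }

    Coords index_to_coords(const Shape &shape, size_t index)
    {
        Coords coord(shape.size());
        for (size_t d = 0; d < shape.size(); ++d)
        {
            coord[d] = index % shape[d];
            index /= shape[d];
        }
        return coord;
    }

    int main()
    {
        const Shape src_shape = {6, 2}; // 6 x 2 tensor
        const Shape dst_shape = {3, 4}; // reshaped to 3 x 4, same 12 elements

        std::vector<int> src(12);
        for (int i = 0; i < 12; ++i)
        {
            src[i] = i; // element value equals its linear index
        }

        std::vector<int> dst(12);
        // Walk the destination and copy element-by-element, as in reshape_tensor_per_element.
        for (size_t linear = 0; linear < dst.size(); ++linear)
        {
            const Coords dst_coord = index_to_coords(dst_shape, linear);
            const Coords src_coord = index_to_coords(src_shape, coords_to_index(dst_shape, dst_coord));
            dst[coords_to_index(dst_shape, dst_coord)] = src[coords_to_index(src_shape, src_coord)];
        }

        std::cout << "dst[5] = " << dst[5] << "\n"; // prints 5: linear order is preserved
    }
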
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index eddbbf7135..ce566fd9e2 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -55,7 +55,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes
@@ -84,10 +84,9 @@ public:
}
private:
- size_t _split_dimension{ Window::DimY };
-
- std::function<void(const Window &window, const ITensor *src, ITensor *dst )> _reshape_tensor_fn{};
+ size_t _split_dimension{Window::DimY};
+ std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp
index 332304599f..702e0a8134 100644
--- a/src/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/cpu/kernels/CpuScaleKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,104 +45,74 @@ namespace kernels
{
namespace
{
-static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels =
-{
- {
- "sve_fp16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
- },
- {
- "sve_fp32_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
- },
- {
- "sve_qu8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
- },
- {
- "sve_qs8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
- },
- {
- "sve_u8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
- },
- {
- "sve_s16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data)
- {
- return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
- },
- REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
- },
- {
- "neon_fp16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
- },
- {
- "neon_fp32_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
- },
- {
- "neon_qu8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
- },
- {
- "neon_qs8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
- },
- {
- "neon_u8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)
- },
- {
- "neon_s8_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)
- },
- {
- "neon_s16_scale",
- [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)
- },
+static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = {
+ {"sve_fp16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)},
+ {"sve_fp32_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)},
+ {"sve_qu8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) {
+ return data.dt == DataType::QASYMM8 && data.isa.sve &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)},
+ {"sve_qs8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve &&
+ data.interpolation_policy != InterpolationPolicy::BILINEAR;
+ },
+ REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)},
+ {"sve_u8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)},
+ {"sve_s16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data)
+ { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+ REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)},
+ {"neon_fp16_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)},
+ {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)},
+ {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)},
+ {"neon_qs8_scale",
+ [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)},
+ {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)},
+ {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)},
+ {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)},
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
{
- const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy });
+ const auto *uk = CpuScaleKernel::get_implementation(
+ ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
ARM_COMPUTE_UNUSED(info.constant_border_value);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
@@ -153,27 +124,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I
ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
- ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && (data_layout != DataLayout::NHWC || info.interpolation_policy != InterpolationPolicy::BILINEAR
- || info.border_mode != BorderMode::REPLICATE));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) &&
+ (data_layout != DataLayout::NHWC ||
+ info.interpolation_policy != InterpolationPolicy::BILINEAR ||
+ info.border_mode != BorderMode::REPLICATE));
- if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
+ if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
}
- if(info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
+ if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- if(dx != nullptr && dy != nullptr)
+ if (dx != nullptr && dy != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
}
}
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+ ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners &&
+ !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
- if(info.interpolation_policy == InterpolationPolicy::AREA)
+ if (info.interpolation_policy == InterpolationPolicy::AREA)
{
ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
@@ -183,24 +157,28 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I
}
} // namespace
-void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets,
- ITensorInfo *dst, const ScaleKernelInfo &info)
+void CpuScaleKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
{
ARM_COMPUTE_UNUSED(dx, dy, offsets);
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- dx,
- dy,
- offsets,
- dst,
- info));
-
- const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy });
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info));
+
+ const auto *uk = CpuScaleKernel::get_implementation(
+ ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
_run_method = uk->ukernel;
- _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy));
+ _name = std::string("CpuScaleKernel")
+ .append("/")
+ .append(uk->name)
+ .append("_")
+ .append(string_from_interpolation_policy(info.interpolation_policy));
// Get data layout and width/height indices
_data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
@@ -212,19 +190,22 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
_constant_border_value = info.constant_border_value;
_align_corners = info.align_corners;
- if(info.sampling_policy == SamplingPolicy::CENTER)
+ if (info.sampling_policy == SamplingPolicy::CENTER)
{
_sampling_offset = 0.5f;
}
// Compute the ratio between source width/height and destination width/height
- const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
+ const auto wr =
+ scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
+ _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _policy;
- if(_border_mode == BorderMode::UNDEFINED)
+ if (_border_mode == BorderMode::UNDEFINED)
{
_border_mode = BorderMode::CONSTANT;
_constant_border_value = PixelValue();
@@ -232,39 +213,38 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
#ifdef ENABLE_NCHW_KERNELS
// Configure scale function to run
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
std::string function_to_call("scale_");
function_to_call += string_from_data_type(src->data_type()) + "_";
function_to_call += string_from_data_layout(_data_layout) + "_";
function_to_call += string_from_interpolation_policy(_policy);
- static std::map<std::string, ScaleFunctionPtr> map_function =
- {
- { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 },
+ static std::map<std::string, ScaleFunctionPtr> map_function = {
+ {"scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8},
- { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> },
- { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
+ {"scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t>},
+ {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>},
- { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> },
- { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
+ {"scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t>},
+ {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>},
- { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> },
- { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> },
+ {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t>},
+ {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t>},
- { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> },
- { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> },
+ {"scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t>},
+ {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t>},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> },
- { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> },
+ {"scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t>},
+ {"scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t>},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> },
- { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> },
+ {"scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float>},
+ {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float>},
};
auto it = map_function.find(function_to_call);
- if(it != map_function.end())
+ if (it != map_function.end())
{
_func = it->second;
}
@@ -278,13 +258,19 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co
#ifdef ENABLE_NCHW_KERNELS
template <typename T>
-void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_nearest_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy);
const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
// Don't increment in X and Y direction for the input tensor
// A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -296,7 +282,7 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const
Window win_off;
win_off.set(Window::DimX, window[Window::DimX]);
win_off.set(Window::DimY, window[Window::DimY]);
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -305,24 +291,33 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const
Iterator src_i(src, win_in);
Iterator dst_i(dst, window);
Iterator offsets_i(offsets, win_off);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
- const auto in_yi = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((
- id.y() + _sampling_offset)
- * hr));
- const int32_t offset_row = in_yi * in_stride_x;
- *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
- },
- src_i, offsets_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
+ const auto in_yi = static_cast<int32_t>(
+ _align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr)
+ : std::floor((id.y() + _sampling_offset) * hr));
+ const int32_t offset_row = in_yi * in_stride_x;
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
+ },
+ src_i, offsets_i, dst_i);
}
template <typename T>
-void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
- Window win_off;
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+ Window win_off;
win_off.set(Window::DimX, window.x());
win_off.set(Window::DimY, window.y());
@@ -332,7 +327,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -347,7 +342,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
const int32_t in_dim_h = src->info()->dimension(1);
const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right;
- if(_border_mode == BorderMode::CONSTANT)
+ if (_border_mode == BorderMode::CONSTANT)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -355,52 +350,60 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
using ConstType = T;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
- && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
- && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) :
- const_border_value;
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+ const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w + index_h * in_stride_w))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w))
+ : const_border_value;
+ const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w))
+ : const_border_value;
+
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ src_i, offsets_i, dx_i, dy_i, dst_i);
}
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
- const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
- const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
- const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
+ else if (_border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+ const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+ auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+ auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+ auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
+ const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
+ const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
+ const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
+
+ *reinterpret_cast<T *>(dst_i.ptr()) =
+ static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ src_i, offsets_i, dx_i, dy_i, dst_i);
}
else
{
@@ -408,7 +411,12 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const
}
}
-void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, offsets);
using namespace scale_helpers;
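
Note for readers skimming the bilinear hunks above: the four neighbours a00/a01/a10/a11 are blended by scale_helpers::delta_bilinear using the fractional offsets dx and dy read from the dx/dy tensors. A minimal, self-contained sketch of that blend, assuming the conventional bilinear weighting (the library helper may differ in rounding details):

#include <cstdio>

// Hypothetical stand-in for scale_helpers::delta_bilinear: blends the 2x2
// neighbourhood (a00 top-left, a01 top-right, a10 bottom-left, a11
// bottom-right) with fractional offsets dx, dy in [0, 1).
static float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy);
    const float w01 = dx * (1.f - dy);
    const float w10 = (1.f - dx) * dy;
    const float w11 = dx * dy;
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}

int main()
{
    // Sampling exactly between four pixels averages them.
    std::printf("%f\n", delta_bilinear_sketch(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f)); // 1.5
    return 0;
}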
@@ -425,50 +433,60 @@ void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const
Iterator src_i(src, win_in);
Iterator dst_i(dst, window);
- const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+ const auto wr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
const auto w = src->info()->dimension(0);
const auto h = src->info()->dimension(1);
const size_t in_stride = src->info()->strides_in_bytes()[1];
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
-
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
-
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
-
- vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
- },
- src_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
+
+ vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ src_i, dst_i);
}
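
scale_area_nchw_u8 above delegates the per-pixel work to pixel_area_c1u8_clamp, which, broadly, averages the block of source pixels that each destination pixel maps onto, clamped to the image borders; it is only used when down-sampling. A rough scalar sketch of that idea, assuming simple box averaging (the library helper's exact rounding and clamping rules are not shown here and may differ):

#include <algorithm>
#include <cstdint>
#include <vector>

// Average the source block that destination pixel (dst_x, dst_y) covers,
// clamped to the image bounds. wr/hr are the width/height resize ratios
// (source dimension / destination dimension, >= 1 when down-sampling).
static uint8_t area_sample_sketch(const std::vector<uint8_t> &src, int src_w, int src_h,
                                  float wr, float hr, int dst_x, int dst_y)
{
    const int x0 = std::clamp(static_cast<int>(dst_x * wr), 0, src_w - 1);
    const int y0 = std::clamp(static_cast<int>(dst_y * hr), 0, src_h - 1);
    const int x1 = std::clamp(static_cast<int>((dst_x + 1) * wr), x0 + 1, src_w);
    const int y1 = std::clamp(static_cast<int>((dst_y + 1) * hr), y0 + 1, src_h);

    int sum = 0;
    for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
            sum += src[y * src_w + x];

    return static_cast<uint8_t>(sum / ((x1 - x0) * (y1 - y0)));
}

int main()
{
    // 4x4 image of value 100 down-sampled 2x: every output pixel averages to 100.
    std::vector<uint8_t> img(16, 100);
    return area_sample_sketch(img, 4, 4, 2.0f, 2.0f, 1, 1) == 100 ? 0 : 1;
}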
template <typename T>
-void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window)
{
// Get data layout and width/height indices
const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners);
+ const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height),
+ dst->info()->dimension(idx_height), _align_corners);
Window win_off;
win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
@@ -479,7 +497,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
win_in.set(idx_width, Window::Dimension(0, 0, 0));
win_in.set(idx_height, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -495,7 +513,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
- if(_border_mode == BorderMode::CONSTANT)
+ if (_border_mode == BorderMode::CONSTANT)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -503,62 +521,74 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
using ConstType = T;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+ const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+ offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dx_val =
+ *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dy_val =
+ *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+ ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h))
+ : const_border_value;
+ const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+ *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ src_i, dst_i);
}
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
+ else if (_border_mode == BorderMode::REPLICATE)
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+ const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+ offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dx_val =
+ *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto dy_val =
+ *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+ const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+
+ auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
+ const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
+ const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
+ const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
+
+ const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+ *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ src_i, dst_i);
}
else
{
@@ -567,8 +597,12 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con
}
#endif // ENABLE_NCHW_KERNELS
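
The QASYMM8 path reformatted above differs from the plain NCHW path only in that the four neighbours are dequantized with the input UniformQuantizationInfo, blended in float, and requantized with the output quantization. A compact sketch of that round trip, using the standard asymmetric-quantization formulas (the QParams names below are illustrative, not the library API):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative uniform asymmetric quantization parameters.
struct QParams { float scale; int offset; };

static float dequantize(uint8_t q, QParams p) { return (static_cast<int>(q) - p.offset) * p.scale; }

static uint8_t quantize(float v, QParams p)
{
    const int q = static_cast<int>(std::lround(v / p.scale)) + p.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255));
}

// Bilinear blend of four quantized neighbours: dequantize, interpolate, requantize.
static uint8_t bilinear_q8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                           float dx, float dy, QParams in_q, QParams out_q)
{
    const float f00 = dequantize(a00, in_q), f01 = dequantize(a01, in_q);
    const float f10 = dequantize(a10, in_q), f11 = dequantize(a11, in_q);
    const float blended = f00 * (1 - dx) * (1 - dy) + f01 * dx * (1 - dy) +
                          f10 * (1 - dx) * dy + f11 * dx * dy;
    return quantize(blended, out_q);
}

int main()
{
    const QParams q{0.5f, 10};
    std::printf("%u\n", bilinear_q8(20, 30, 40, 50, 0.5f, 0.5f, q, q)); // midpoint -> 35
    return 0;
}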
-Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
+Status CpuScaleKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *output,
+ const ScaleKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
return Status{};
@@ -588,13 +622,14 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th
const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1);
const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2);
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
(this->*_func)(src, dst, dx, dy, offsets, window);
}
else
{
- _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
+ _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset,
+ _align_corners, window);
}
}
diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h
index 8102142fc3..38142df021 100644
--- a/src/cpu/kernels/CpuScaleKernel.h
+++ b/src/cpu/kernels/CpuScaleKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_SCALEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
@@ -39,9 +40,19 @@ class CpuScaleKernel : public ICpuKernel<CpuScaleKernel>
{
private:
/** Pointer to the scale function selected for the current configuration */
- using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
- using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
- InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
+ using ScaleFunctionPtr = void (CpuScaleKernel::*)(
+ const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
+ using ScaleKernelPtr = std::add_pointer<void(const ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ InterpolationPolicy,
+ BorderMode,
+ PixelValue,
+ float,
+ bool,
+ const Window &)>::type;
public:
CpuScaleKernel() = default;
@@ -59,7 +70,11 @@ public:
* @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to use for configuration
*/
- void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
@@ -67,11 +82,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dx,
+ const ITensorInfo *dy,
+ const ITensorInfo *offsets,
+ ITensorInfo *dst,
const ScaleKernelInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct ScaleKernel
@@ -89,28 +108,48 @@ private:
*
* @note Used only in the case of down-sampling.
*/
- void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_area_nchw_u8(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
/** function to perform scale using bilinear interpolation on the given window */
template <typename T>
- void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_bilinear_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
/** function to perform scale using bilinear interpolation on the given window */
template <typename T>
- void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_bilinear_qasymm(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
/** function to perform scale using nearest neighbour on the given window */
template <typename T>
- void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+ void scale_nearest_nchw(const ITensor *src,
+ ITensor *dst,
+ const ITensor *dx,
+ const ITensor *dy,
+ const ITensor *offsets,
+ const Window &window);
#endif // ENABLE_NCHW_KERNELS
- ScaleFunctionPtr _func{ nullptr };
+ ScaleFunctionPtr _func{nullptr};
InterpolationPolicy _policy{};
BorderMode _border_mode{};
PixelValue _constant_border_value{};
- float _sampling_offset{ 0 };
- bool _align_corners{ false };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- ScaleKernelPtr _run_method{ nullptr };
+ float _sampling_offset{0};
+ bool _align_corners{false};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ ScaleKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
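
The header reformat above also makes the two dispatch mechanisms easier to tell apart: ScaleFunctionPtr is a pointer-to-member used for the legacy NCHW paths and invoked as (this->*_func)(...), while ScaleKernelPtr is a plain function pointer built with std::add_pointer for the layout-agnostic micro-kernels. A stripped-down illustration of the two idioms (the class and function names here are made up for the example):

#include <cstdio>
#include <type_traits>

// Free function suitable for a plain function pointer.
static void run_free(int x) { std::printf("free kernel: %d\n", x); }

class Dispatcher
{
public:
    void run_member(int x) { std::printf("member kernel: %d\n", x); }

    void call(int x)
    {
        (this->*_member)(x); // pointer-to-member call, as with _func in CpuScaleKernel
        _free(x);            // plain function pointer call, as with _run_method
    }

private:
    using MemberPtr = void (Dispatcher::*)(int);
    using FreePtr   = std::add_pointer<void(int)>::type;

    MemberPtr _member{&Dispatcher::run_member};
    FreePtr   _free{&run_free};
};

int main()
{
    Dispatcher d;
    d.call(42);
    return 0;
}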
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index e06ab9917c..ce144351f8 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -30,11 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
#include "src/cpu/kernels/softmax/list.h"
namespace arm_compute
@@ -46,61 +46,44 @@ namespace kernels
namespace
{
/* Softmax Logits 1D Max - identifying the max value of 1D Logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits =
-{
- {
- "sve_fp32_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_logits)
- },
- {
- "sve_fp16_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_logits)
- },
- {
- "sve_qu8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
- REGISTER_QASYMM8_SVE(sve_qasymm8_logits)
- },
- {
- "sve_qs8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
- REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)
- },
- {
- "neon_fp32_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_logits)
- },
- {
- "neon_fp16_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_logits)
- },
- {
- "neon_qu8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(neon_qasymm8_logits)
- },
- {
- "neon_qs8_logits_1d_max",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)
- },
+static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
+ {"sve_fp32_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(sve_fp32_logits)},
+ {"sve_fp16_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(sve_fp16_logits)},
+ {"sve_qu8_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
+ REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
+ {"sve_qs8_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
+ REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
+ {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_logits)},
+ {"neon_fp16_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_logits)},
+ {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
+ {"neon_qs8_logits_1d_max",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
};
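
Each entry in the reformatted table above pairs a kernel name, a selector predicate over the data type and ISA, and the registered implementation; broadly, get_implementation walks the table in order and returns the first entry whose predicate accepts the query, which is why the more specialised SVE entries precede the NEON fallbacks. A minimal sketch of that registry pattern, under those assumptions and with invented names:

#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Simplified stand-ins for the selector data and registry entry used above.
struct SelectorData { bool is_f32; bool has_sve; };

struct MicroKernel
{
    std::string name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)();
};

static void sve_fp32_impl()  { std::printf("sve_fp32\n"); }
static void neon_fp32_impl() { std::printf("neon_fp32\n"); }

// Ordered table: specialised (SVE) entries come before generic NEON fallbacks,
// so the first predicate that matches decides the implementation.
static const std::vector<MicroKernel> registry = {
    {"sve_fp32", [](const SelectorData &d) { return d.is_f32 && d.has_sve; }, sve_fp32_impl},
    {"neon_fp32", [](const SelectorData &d) { return d.is_f32; }, neon_fp32_impl},
};

static const MicroKernel *get_implementation(const SelectorData &d)
{
    for (const auto &k : registry)
        if (k.is_selected(d))
            return &k;
    return nullptr;
}

int main()
{
    const MicroKernel *uk = get_implementation({/*is_f32=*/true, /*has_sve=*/false});
    if (uk != nullptr)
        uk->ukernel(); // prints "neon_fp32"
    return 0;
}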
Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
// Validate in case of configured output
- if(output.total_size() != 0)
+ if (output.total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
+ TensorShape(input.tensor_shape()).set(0, 1));
}
return Status{};
@@ -121,7 +104,7 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
- const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
_run_method = uk->ukernel;
@@ -158,60 +141,46 @@ const char *CpuLogits1DMaxKernel::name() const
}
/* Softmax Logits 1D - computation using the pre-computed max (FP and quantized paths). */
-template <bool IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits =
-{
- {
- "sve2_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
- REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)
- },
- {
- "sve2_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
- REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)
- },
- {
- "sve_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
- REGISTER_FP32_SVE(sve_fp32_softmax)
- },
- {
- "sve_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
- REGISTER_FP16_SVE(sve_fp16_softmax)
- },
-
- {
- "neon_fp32_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(neon_fp32_softmax)
- },
- {
- "neon_fp16_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(neon_fp16_softmax)
- },
- {
- "neon_qu8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)
- },
- {
- "neon_qs8_softmax_logits_1d",
- [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)
- },
+template <bool IS_LOG>
+static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
+ {"sve2_qu8_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+ REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
+ {"sve2_qs8_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+ REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
+ {"sve_fp32_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+ REGISTER_FP32_SVE(sve_fp32_softmax)},
+ {"sve_fp16_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+ REGISTER_FP16_SVE(sve_fp16_softmax)},
+
+ {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(neon_fp32_softmax)},
+ {"neon_fp16_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(neon_fp16_softmax)},
+ {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
+ {"neon_qs8_softmax_logits_1d",
+ [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
};
namespace
{
-Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
- const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
+Status validate_arguments_logits_softmax(const ITensorInfo &src,
+ const ITensorInfo &max,
+ const ITensorInfo &dst,
+ const float beta,
+ const ITensorInfo &tmp,
+ bool is_log)
{
ARM_COMPUTE_UNUSED(beta);
// Check input
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
@@ -221,16 +190,18 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
// Check output if configured
- if(dst.total_size() != 0)
+ if (dst.total_size() != 0)
{
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
+ const QuantizationInfo output_quantization =
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log)
+ : dst.quantization_info();
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
}
// Check tmp if configured
- if(tmp.total_size() != 0)
+ if (tmp.total_size() != 0)
{
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
@@ -243,14 +214,16 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
}
} // namespace
-template <bool IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+template <bool IS_LOG>
+const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
+CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
{
return available_kernels_logits<IS_LOG>;
}
template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
+ const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
@@ -259,17 +232,21 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
// Output auto initialization if not yet initialized
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
+ const QuantizationInfo output_quantization =
+ is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+ : dst->quantization_info();
auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
// Tmp auto initialization if not yet initialized
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
- const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
+ DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+ std::string kernel_name =
+ IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
_beta = beta;
_run_method = uk->ukernel;
@@ -282,8 +259,8 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I
}
template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+ const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
@@ -305,7 +282,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
+ const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
@@ -314,7 +291,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window
}
template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
{
return _name.c_str();
}
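
As the kernel names above suggest, the computation is split in two: CpuLogits1DMaxKernel finds the per-row maximum, and CpuLogits1DSoftmaxKernel then evaluates (log-)softmax against that maximum so the exponentials cannot overflow. A scalar reference of the stable formulation, offered only as a mental model for the vectorised micro-kernels, with beta scaling as in the configure() signature:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax over one row: subtract the row max before exp.
static std::vector<float> softmax_1d(const std::vector<float> &logits, float beta, bool is_log)
{
    const float max_val = *std::max_element(logits.begin(), logits.end());

    std::vector<float> out(logits.size());
    float sum = 0.f;
    for (std::size_t i = 0; i < logits.size(); ++i)
    {
        out[i] = std::exp((logits[i] - max_val) * beta);
        sum += out[i];
    }
    for (float &v : out)
        v = is_log ? std::log(v / sum) : v / sum;
    return out;
}

int main()
{
    for (float v : softmax_1d({1.f, 2.f, 3.f}, 1.f, false))
        std::printf("%f ", v); // ~0.09 0.24 0.67
    std::printf("\n");
    return 0;
}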
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 59f43bd1d2..5d288179fd 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -57,7 +57,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct SoftmaxLogits1DMaxKernel
@@ -70,7 +70,7 @@ public:
static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
private:
- SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr };
+ SoftmaxLogits1DMaxKernelPtr _run_method{nullptr};
std::string _name{};
};
@@ -79,7 +79,8 @@ template <bool IS_LOG = false>
class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
{
private:
- using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+ using SoftmaxLogits1DKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
public:
CpuLogits1DSoftmaxKernel() = default;
@@ -95,18 +96,22 @@ public:
*
* @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuLogits1DSoftmaxKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *max,
+ const ITensorInfo *dst,
+ const float beta,
+ const ITensorInfo *tmp);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
struct SoftmaxLogits1DKernel
@@ -119,8 +124,8 @@ public:
static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
private:
- float _beta{ 1.0f };
- SoftmaxLogits1DKernelPtr _run_method{ nullptr };
+ float _beta{1.0f};
+ SoftmaxLogits1DKernelPtr _run_method{nullptr};
std::string _name{};
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index 875d613dca..2b2c6f2e92 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/add/generic/neon/impl.h"
@@ -51,70 +52,48 @@ namespace
using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData;
using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
-static const std::vector<CpuSubKernel::SubKernel> available_kernels =
-{
- {
- "neon_fp32_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
- },
- {
- "neon_fp16_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
- REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
- },
- {
- "neon_u8_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
- },
- {
- "neon_s16_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
- },
- {
- "neon_s32_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
- },
- {
- "neon_qu8_sub_fixedpoint",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)
- },
- {
- "neon_qs8_sub_fixedpoint",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)
- },
- {
- "neon_qu8_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
- },
- {
- "neon_qs8_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
- },
- {
- "neon_qs16_sub",
- [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
- },
+static const std::vector<CpuSubKernel::SubKernel> available_kernels = {
+ {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)},
+ {"neon_fp16_sub",
+ [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)},
+ {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)},
+ {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)},
+ {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)},
+ {"neon_qu8_sub_fixedpoint",
+ [](const CpuSubKernelDataTypeISASelectorData &data)
+ { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)},
+ {"neon_qs8_sub_fixedpoint",
+ [](const CpuSubKernelDataTypeISASelectorData &data)
+ { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)},
+ {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)},
+ {"neon_qs8_sub",
+ [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)},
+ {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)},
};
-inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+inline Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16,
+ DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
- const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+ CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
@@ -125,7 +104,7 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src
"Convert policy cannot be WRAP if datatype is quantized");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
@@ -147,7 +126,8 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
set_data_type_if_unknown(*dst, src0->data_type());
const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst);
- const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint });
+ const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+ CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
@@ -167,14 +147,14 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
ARM_COMPUTE_UNUSED(thread_count);
#if defined(ENABLE_FP32_KERNELS)
- if(this->_run_method == &sub_same_neon<float>)
+ if (this->_run_method == &sub_same_neon<float>)
{
size_t mws = ICPPKernel::default_mws;
- if(platform.get_cpu_model() == CPUModel::N1)
+ if (platform.get_cpu_model() == CPUModel::N1)
{
mws = default_mws_N1_fp32_neon;
}
- else if(platform.get_cpu_model() == CPUModel::V1)
+ else if (platform.get_cpu_model() == CPUModel::V1)
{
mws = default_mws_V1_fp32_neon;
}
@@ -184,7 +164,7 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
}
// tensor is 1D or was re-interpreted as 1D
- if(this->window().shape().num_dimensions() == 1)
+ if (this->window().shape().num_dimensions() == 1)
{
return mws;
}
@@ -203,7 +183,8 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
return ICPPKernel::default_mws;
}
-Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+Status
+CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
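
For the quantized sub paths above, a plausible reading of the "Convert policy cannot be WRAP if datatype is quantized" check is that wrap-around in the 8-bit quantized domain has no meaningful real-valued interpretation, so the quantized kernels always saturate. A reference sketch of QASYMM8 subtraction under that assumption (dequantize, subtract in float, requantize with saturation); parameter names are illustrative, not the library API:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative uniform asymmetric quantization parameters.
struct QParams { float scale; int offset; };

static uint8_t sub_qasymm8_ref(uint8_t a, uint8_t b, QParams qa, QParams qb, QParams qo)
{
    const float fa = (static_cast<int>(a) - qa.offset) * qa.scale;
    const float fb = (static_cast<int>(b) - qb.offset) * qb.scale;
    const int q = static_cast<int>(std::lround((fa - fb) / qo.scale)) + qo.offset;
    return static_cast<uint8_t>(std::clamp(q, 0, 255)); // saturate instead of wrapping
}

int main()
{
    const QParams q{0.1f, 128};
    std::printf("%u\n", sub_qasymm8_ref(128, 200, q, q, q)); // (0.0 - 7.2)/0.1 + 128 = 56
    return 0;
}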
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index cd209d1837..5fa0dc411a 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -37,7 +37,8 @@ namespace kernels
class CpuSubKernel : public ICpuKernel<CpuSubKernel>
{
private:
- using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+ using SubKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
public:
@@ -68,7 +69,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -99,9 +101,9 @@ public:
private:
ConvertPolicy _policy{};
- SubKernelPtr _run_method{ nullptr };
+ SubKernelPtr _run_method{nullptr};
std::string _name{};
- size_t _split_dimension{ Window::DimY };
+ size_t _split_dimension{Window::DimY};
};
} // namespace kernels
} // namespace cpu
diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp
index b2cebc4230..615bc6ce1e 100644
--- a/src/cpu/kernels/CpuTransposeKernel.cpp
+++ b/src/cpu/kernels/CpuTransposeKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,7 +46,7 @@ namespace
{
unsigned int num_elems_processed(size_t element_size)
{
- switch(element_size)
+ switch (element_size)
{
case 1:
return 8;
@@ -81,10 +82,10 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -101,87 +102,121 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 8x8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
- const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
- const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
- const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
- const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
- const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
- const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
- const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
-
- // Transpose 2x2
- const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
- const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
- const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
- const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
-
- // Transpose 4x4
- const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
- const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
- const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
- const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
-
- // Transpose 8x8
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
- const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
- const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
- }
-
- // Compute left-over elements along the x dimension (1x8)
- for(; x < window_end_x; ++x)
- {
- const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
- const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
- const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
- const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
- const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
- const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
- const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
- const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
-
- uint8x8_t result = vdup_n_u8(0);
- result = vset_lane_u8(val0, result, 0);
- result = vset_lane_u8(val1, result, 1);
- result = vset_lane_u8(val2, result, 2);
- result = vset_lane_u8(val3, result, 3);
- result = vset_lane_u8(val4, result, 4);
- result = vset_lane_u8(val5, result, 5);
- result = vset_lane_u8(val6, result, 6);
- result = vset_lane_u8(val7, result, 7);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(output.ptr() + dst_offset_in_bytes, result);
- }
- },
- input, output);
+ // Compute 8x8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x8_t row0 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
+ const uint8x8_t row1 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
+ const uint8x8_t row2 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
+ const uint8x8_t row3 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
+ const uint8x8_t row4 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
+ const uint8x8_t row5 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
+ const uint8x8_t row6 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
+ const uint8x8_t row7 =
+ vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
+
+ // Transpose 2x2
+ const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+ const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+ const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+ const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+
+ // Transpose 4x4
+ const uint16x4x2_t k0_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+ const uint16x4x2_t k1_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+ const uint16x4x2_t k2_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+ const uint16x4x2_t k3_u16 =
+ vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+
+ // Transpose 8x8
+ const uint32x2x2_t k0_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+ const uint32x2x2_t k1_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+ const uint32x2x2_t k2_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+ const uint32x2x2_t k3_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+ vst1_u8(
+ reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+ vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+ }
+
+ // Compute left-over elements along the x dimension (1x8)
+ for (; x < window_end_x; ++x)
+ {
+ const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
+ const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
+ const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
+ const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
+ const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
+ const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
+ const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
+ const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
+
+ uint8x8_t result = vdup_n_u8(0);
+ result = vset_lane_u8(val0, result, 0);
+ result = vset_lane_u8(val1, result, 1);
+ result = vset_lane_u8(val2, result, 2);
+ result = vset_lane_u8(val3, result, 3);
+ result = vset_lane_u8(val4, result, 4);
+ result = vset_lane_u8(val5, result, 5);
+ result = vset_lane_u8(val6, result, 6);
+ result = vset_lane_u8(val7, result, 7);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+ vst1_u8(output.ptr() + dst_offset_in_bytes, result);
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -190,16 +225,18 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint8_t val0 = *input.ptr();
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint8_t val0 = *input.ptr();
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
- *(output.ptr() + dst_offset_in_bytes) = val0;
- },
- input, output);
+ *(output.ptr() + dst_offset_in_bytes) = val0;
+ },
+ input, output);
}
}
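For reference, the trn cascade in the hunks above can be read as a standalone routine. Below is a minimal sketch of the same 8x8 byte transpose on plain pointers, assuming an Arm NEON target with <arm_neon.h>; the function name, parameters and strides are illustrative, but the intrinsic sequence and the store order mirror the kernel code shown in the diff.

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Transpose an 8x8 block of bytes: dst[j][i] = src[i][j].
// Stage 1 (vtrn_u8)  exchanges single bytes between adjacent row pairs,
// stage 2 (vtrn_u16) exchanges 2x2 sub-blocks,
// stage 3 (vtrn_u32) exchanges 4x4 sub-blocks.
static inline void transpose_8x8_u8(const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride)
{
    uint8x8_t r[8];
    for (int i = 0; i < 8; ++i)
    {
        r[i] = vld1_u8(src + i * src_stride);
    }

    const uint8x8x2_t t01 = vtrn_u8(r[0], r[1]);
    const uint8x8x2_t t23 = vtrn_u8(r[2], r[3]);
    const uint8x8x2_t t45 = vtrn_u8(r[4], r[5]);
    const uint8x8x2_t t67 = vtrn_u8(r[6], r[7]);

    const uint16x4x2_t s0 = vtrn_u16(vreinterpret_u16_u8(t01.val[0]), vreinterpret_u16_u8(t23.val[0]));
    const uint16x4x2_t s1 = vtrn_u16(vreinterpret_u16_u8(t01.val[1]), vreinterpret_u16_u8(t23.val[1]));
    const uint16x4x2_t s2 = vtrn_u16(vreinterpret_u16_u8(t45.val[0]), vreinterpret_u16_u8(t67.val[0]));
    const uint16x4x2_t s3 = vtrn_u16(vreinterpret_u16_u8(t45.val[1]), vreinterpret_u16_u8(t67.val[1]));

    const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(s0.val[0]), vreinterpret_u32_u16(s2.val[0]));
    const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(s0.val[1]), vreinterpret_u32_u16(s2.val[1]));
    const uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(s1.val[0]), vreinterpret_u32_u16(s3.val[0]));
    const uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(s1.val[1]), vreinterpret_u32_u16(s3.val[1]));

    // Same store order as the kernel: the low halves of c0/c2/c1/c3 hold
    // columns 0..3 of the source block, their high halves columns 4..7.
    vst1_u8(dst + 0 * dst_stride, vreinterpret_u8_u32(c0.val[0]));
    vst1_u8(dst + 1 * dst_stride, vreinterpret_u8_u32(c2.val[0]));
    vst1_u8(dst + 2 * dst_stride, vreinterpret_u8_u32(c1.val[0]));
    vst1_u8(dst + 3 * dst_stride, vreinterpret_u8_u32(c3.val[0]));
    vst1_u8(dst + 4 * dst_stride, vreinterpret_u8_u32(c0.val[1]));
    vst1_u8(dst + 5 * dst_stride, vreinterpret_u8_u32(c2.val[1]));
    vst1_u8(dst + 6 * dst_stride, vreinterpret_u8_u32(c1.val[1]));
    vst1_u8(dst + 7 * dst_stride, vreinterpret_u8_u32(c3.val[1]));
}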
@@ -220,10 +257,10 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -240,61 +277,77 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
- const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
-
- // Transpose 4x4
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint16x4_t result = vdup_n_u16(0);
- result = vset_lane_u16(val0, result, 0);
- result = vset_lane_u16(val1, result, 1);
- result = vset_lane_u16(val2, result, 2);
- result = vset_lane_u16(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
+ // Compute 4x4 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint16x4_t row0 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint16x4_t row1 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint16x4_t row2 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint16x4_t row3 =
+ vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ // Transpose 2x2
+ const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+ const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+
+ // Transpose 4x4
+ const uint32x2x2_t k0_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+ const uint32x2x2_t k1_u32 =
+ vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k0_u32.val[0]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k1_u32.val[0]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k0_u32.val[1]));
+ vst1_u16(
+ reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vreinterpret_u16_u32(k1_u32.val[1]));
+ }
+
+ // Compute left-over elements (1x4)
+ for (; x < window_end_x; ++x)
+ {
+ const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ uint16x4_t result = vdup_n_u16(0);
+ result = vset_lane_u16(val0, result, 0);
+ result = vset_lane_u16(val1, result, 1);
+ result = vset_lane_u16(val2, result, 2);
+ result = vset_lane_u16(val3, result, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -303,16 +356,18 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
- *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
+ *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
}
}
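To see why two trn stages are enough for the 4x4 16-bit case above, it helps to trace the lanes for rows r0 = [a0 a1 a2 a3], r1 = [b0 b1 b2 b3], r2 = [c0 c1 c2 c3], r3 = [d0 d1 d2 d3] (an illustrative trace, not library code):

    vtrn_u16(r0, r1) -> { [a0 b0 a2 b2], [a1 b1 a3 b3] }
    vtrn_u16(r2, r3) -> { [c0 d0 c2 d2], [c1 d1 c3 d3] }
    vtrn_u32 of the two .val[0] halves (each u16 pair viewed as one u32 lane)
                     -> { [a0 b0 c0 d0], [a2 b2 c2 d2] }   // columns 0 and 2
    vtrn_u32 of the two .val[1] halves
                     -> { [a1 b1 c1 d1], [a3 b3 c3 d3] }   // columns 1 and 3

which is exactly the order in which the kernel stores the results: columns 0, 1, 2 and 3 of the source tile go to output rows 0, 1, 2 and 3 respectively.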
@@ -347,10 +402,10 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -367,102 +422,160 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 8x8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- // Load
- const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
- const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
- const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
- const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
- const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
-
- // Transpose 2x4
- const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])};
- const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])};
- const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])};
- const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])};
- const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])};
- const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])};
- const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])};
- const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])};
-
- // Transpose 2x2
- const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
- const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
- const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
- const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
- const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
- const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
- const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
- const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
-
- // Swap blocks
- const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])};
- const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])};
- const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])};
- const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])};
- const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])};
- const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])};
- const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])};
- const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])};
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- // Store
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6);
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7);
- }
-
- // Compute left-over elements (8x1)
- for(; x < window_end_x; ++x)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
- const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
- const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
- const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
- const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
-
- uint32x4_t result0 = vdupq_n_u32(0);
- uint32x4_t result1 = vdupq_n_u32(0);
- result0 = vsetq_lane_u32(val0, result0, 0);
- result0 = vsetq_lane_u32(val1, result0, 1);
- result0 = vsetq_lane_u32(val2, result0, 2);
- result0 = vsetq_lane_u32(val3, result0, 3);
- result1 = vsetq_lane_u32(val4, result1, 0);
- result1 = vsetq_lane_u32(val5, result1, 1);
- result1 = vsetq_lane_u32(val6, result1, 2);
- result1 = vsetq_lane_u32(val7, result1, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
- }
- },
- input, output);
+ // Compute 8x8 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load
+ const uint32x4x2_t row0 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row1 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row2 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row3 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row4 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row5 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row6 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+ const uint32x4x2_t row7 =
+ vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+
+ // Transpose 2x4
+ const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]),
+ vtrn2q_u32(row0.val[0], row1.val[0])};
+ const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]),
+ vtrn2q_u32(row0.val[1], row1.val[1])};
+ const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]),
+ vtrn2q_u32(row2.val[0], row3.val[0])};
+ const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]),
+ vtrn2q_u32(row2.val[1], row3.val[1])};
+ const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]),
+ vtrn2q_u32(row4.val[0], row5.val[0])};
+ const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]),
+ vtrn2q_u32(row4.val[1], row5.val[1])};
+ const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]),
+ vtrn2q_u32(row6.val[0], row7.val[0])};
+ const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]),
+ vtrn2q_u32(row6.val[1], row7.val[1])};
+
+ // Transpose 2x2
+ const uint64x2x2_t k0_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
+ const uint64x2x2_t k1_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
+ const uint64x2x2_t k2_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
+ const uint64x2x2_t k3_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
+ const uint64x2x2_t k4_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
+ const uint64x2x2_t k5_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
+ const uint64x2x2_t k6_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
+ const uint64x2x2_t k7_u64 = {
+ vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])),
+ vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
+
+ // Swap blocks
+ const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]),
+ vreinterpretq_u32_u64(k4_u64.val[0])};
+ const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]),
+ vreinterpretq_u32_u64(k5_u64.val[0])};
+ const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]),
+ vreinterpretq_u32_u64(k4_u64.val[1])};
+ const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]),
+ vreinterpretq_u32_u64(k5_u64.val[1])};
+ const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]),
+ vreinterpretq_u32_u64(k6_u64.val[0])};
+ const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]),
+ vreinterpretq_u32_u64(k7_u64.val[0])};
+ const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]),
+ vreinterpretq_u32_u64(k6_u64.val[1])};
+ const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]),
+ vreinterpretq_u32_u64(k7_u64.val[1])};
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ // Store
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ col0);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ col1);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ col2);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ col3);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+ col4);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+ col5);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+ col6);
+ vst1q_u32_x2_(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+ col7);
+ }
+
+ // Compute left-over elements (8x1)
+ for (; x < window_end_x; ++x)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+ const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+ const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+ const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+ const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+
+ uint32x4_t result0 = vdupq_n_u32(0);
+ uint32x4_t result1 = vdupq_n_u32(0);
+ result0 = vsetq_lane_u32(val0, result0, 0);
+ result0 = vsetq_lane_u32(val1, result0, 1);
+ result0 = vsetq_lane_u32(val2, result0, 2);
+ result0 = vsetq_lane_u32(val3, result0, 3);
+ result1 = vsetq_lane_u32(val4, result1, 0);
+ result1 = vsetq_lane_u32(val5, result1, 1);
+ result1 = vsetq_lane_u32(val6, result1, 2);
+ result1 = vsetq_lane_u32(val7, result1, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -471,40 +584,42 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
- *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
}
}
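The aarch64-only 8x8 path above applies the same idea one level up: TRN1/TRN2 at u32 and u64 granularity transpose each 4x4 quadrant inside the Q registers, and the final "Swap blocks" step reassembles the quadrants according to the block-matrix identity

    [[A, B], [C, D]]^T = [[A^T, C^T], [B^T, D^T]]

so every 4x4 block is transposed in place while the two off-diagonal blocks exchange positions. The vld1q_u32_x2_ / vst1q_u32_x2_ helpers used here simply move each 8-element row as two uint32x4_t halves (val[0] and val[1]).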
#else // __aarch64__
void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
{
- const int window_step_x = 4;
- const int window_step_y = 4;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+ const int window_step_x = 4;
+ const int window_step_y = 4;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_start_y = window.y().start();
+ const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+ const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
// Check if we need a left-over loop for the y dimension
bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
Window window_in(window);
window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
// Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
+ if (window_end_y_multiple_of > window_start_y)
{
window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
}
@@ -521,60 +636,74 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
+ if (in->info()->dimension(1) != 1)
{
Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
- const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
- const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
- const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- // Swap block 01 with block 10 and store
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
{
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint32x4_t result = vdupq_n_u32(0);
- result = vsetq_lane_u32(val0, result, 0);
- result = vsetq_lane_u32(val1, result, 1);
- result = vsetq_lane_u32(val2, result, 2);
- result = vsetq_lane_u32(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
+ // Compute 4x4 elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint32x4_t row0 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32x4_t row1 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32x4_t row2 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32x4_t row3 =
+ vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ // Transpose 2x2
+ const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ // Swap block 01 with block 10 and store
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+ vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+ vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+ vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+ vst1q_u32(
+ reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+ vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+ }
+
+ // Compute left-over elements (1x4)
+ for (; x < window_end_x; ++x)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+ const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+ const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+ const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+ uint32x4_t result = vdupq_n_u32(0);
+ result = vsetq_lane_u32(val0, result, 0);
+ result = vsetq_lane_u32(val1, result, 1);
+ result = vsetq_lane_u32(val2, result, 2);
+ result = vsetq_lane_u32(val3, result, 3);
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
+ }
+ },
+ input, output);
}
- if(left_over_loop_y)
+ if (left_over_loop_y)
{
window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -583,16 +712,18 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win
Iterator output(out, window_out);
// Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+ execute_window_loop(
+ window_in,
+ [&](const Coordinates &id)
+ {
+ const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
- *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+ },
+ input, output);
}
}
#endif // __aarch64__
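Without the 128-bit TRN1/TRN2 instructions (the non-__aarch64__ branch above), the 32-bit transpose instead works on 64-bit D registers: each 4x4 tile is split into four 2x2 blocks, each block is transposed with a single vtrn_u32, and vcombine_u32 reassembles the rows so that the two off-diagonal blocks trade places, which is what the "Swap block 01 with block 10 and store" comment refers to. For rows r0 = [a0 a1 a2 a3] ... r3 = [d0 d1 d2 d3], an illustrative trace:

    vtrn_u32(low(r0),  low(r1))  -> { [a0 b0], [a1 b1] }   // block 00, transposed
    vtrn_u32(high(r0), high(r1)) -> { [a2 b2], [a3 b3] }   // block 01, transposed
    vtrn_u32(low(r2),  low(r3))  -> { [c0 d0], [c1 d1] }   // block 10, transposed
    vtrn_u32(high(r2), high(r3)) -> { [c2 d2], [c3 d3] }   // block 11, transposed

    output row 0 = vcombine(block00.val[0], block10.val[0]) = [a0 b0 c0 d0]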
@@ -616,7 +747,8 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
// Configure kernel window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped
Coordinates coord;
@@ -637,7 +769,7 @@ Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *d
"Element size not supported");
// Validate configured destination
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
@@ -658,7 +790,7 @@ void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cons
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- switch(src->info()->element_size())
+ switch (src->info()->element_size())
{
case 1:
transpose_8bit_elements(src, dst, window);
diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h
index cb85daeb40..e79a405677 100644
--- a/src/cpu/kernels/CpuTransposeKernel.h
+++ b/src/cpu/kernels/CpuTransposeKernel.h
@@ -54,7 +54,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
index 2ccc977995..297ba63826 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,7 +39,7 @@ namespace
{
TensorShape get_output_shape(const ITensorInfo *src, bool has_bias)
{
- TensorShape output_shape{ src->tensor_shape() };
+ TensorShape output_shape{src->tensor_shape()};
output_shape.collapse(3);
const size_t tmp_dim = output_shape[0];
@@ -54,20 +55,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1));
ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2));
ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4]));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] ||
+ biases->dimension(1) != src->tensor_shape()[4]));
}
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ get_output_shape(src, biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -84,9 +87,7 @@ void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInf
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr))));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
- biases,
- dst));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst));
// Configure kernel
Window window = calculate_max_window(*src, Steps());
@@ -122,44 +123,47 @@ void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window,
// Create iterators
Iterator in(src, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get column index
- const int kernel_idx = id[3];
- const int kernel_idz = id[4];
-
- // Setup pointers
- const uint8_t *tmp_input_ptr = in.ptr();
- uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
- const uint8_t *curr_input_row_ptr = tmp_input_ptr;
- const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
- // Linearize volume
- for(unsigned int d = 0; d < kernel_depth; ++d)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- for(unsigned int j = 0; j < kernel_size_y; ++j)
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for (unsigned int d = 0; d < kernel_depth; ++d)
{
- for(unsigned int i = 0; i < kernel_size_x; ++i)
+ for (unsigned int j = 0; j < kernel_size_y; ++j)
{
- std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
- tmp_input_ptr += input_stride_x;
- tmp_output_ptr += output_stride_y;
+ for (unsigned int i = 0; i < kernel_size_x; ++i)
+ {
+ std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
}
- curr_input_row_ptr += input_stride_y;
- tmp_input_ptr = curr_input_row_ptr;
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
}
- curr_input_depth_ptr += input_stride_z;
- curr_input_row_ptr = curr_input_depth_ptr;
- tmp_input_ptr = curr_input_depth_ptr;
- }
- // Add bias
- if(biases != nullptr)
- {
- std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size());
- }
- },
- in);
+ // Add bias
+ if (biases != nullptr)
+ {
+ std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)),
+ src->info()->element_size());
+ }
+ },
+ in);
}
const char *CpuWeightsReshapeKernel::name() const
{
@@ -167,4 +171,4 @@ const char *CpuWeightsReshapeKernel::name() const
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
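The run_op loop above linearizes one kernel_size_x x kernel_size_y x kernel_depth filter per window iteration into a single column of the reshaped weights matrix, appending that kernel's bias as the final element when a bias tensor is supplied. A minimal scalar sketch of the same layout, using plain std::vector storage and illustrative dimensions instead of the ITensor/Window machinery (function and parameter names are hypothetical):

#include <cstddef>
#include <vector>

// Flatten weights of shape [kx, ky, depth, num_kernels] (x fastest) into a
// matrix with one column per kernel, traversed x -> y -> depth as in the
// kernel above; the optional bias becomes the last row of each column.
std::vector<float> reshape_weights(const std::vector<float> &w,
                                   const float             *bias, // nullptr if there is no bias
                                   size_t kx, size_t ky, size_t depth, size_t num_kernels)
{
    const size_t col_len = kx * ky * depth + (bias != nullptr ? 1 : 0);
    std::vector<float> out(col_len * num_kernels);

    for (size_t n = 0; n < num_kernels; ++n)
    {
        size_t row = 0;
        for (size_t d = 0; d < depth; ++d)
        {
            for (size_t y = 0; y < ky; ++y)
            {
                for (size_t x = 0; x < kx; ++x)
                {
                    const size_t src = ((n * depth + d) * ky + y) * kx + x;
                    out[row * num_kernels + n] = w[src]; // column n, consecutive rows
                    ++row;
                }
            }
        }
        if (bias != nullptr)
        {
            out[row * num_kernels + n] = bias[n]; // bias appended after the volume
        }
    }
    return out;
}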
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h
index 1a260edc96..9310b3c784 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.h
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -82,7 +82,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
};
} // namespace kernels
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
index 818d878119..52e3f2549c 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -28,8 +28,10 @@ namespace arm_compute
{
namespace cpu
{
-CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads)
- : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads }
+CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads)
+ : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
{
}
@@ -49,24 +51,20 @@ void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const W
const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes;
const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes;
const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes;
- const auto input_nhwc_ptr = reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
- auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() + winograd_input_transform->info()->offset_first_element_in_bytes());
+ const auto input_nhwc_ptr =
+ reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
+ auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() +
+ winograd_input_transform->info()->offset_first_element_in_bytes());
- _winograd_impl.input_transform->execute(
- _conv_args,
- input_nhwc_ptr,
- input_batch_stride,
- input_row_stride,
- input_col_stride,
- win_transf_ptr,
- _winograd_impl.winograd_spec,
- workspace->buffer(),
- info.thread_id,
- _nthreads);
+ _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride,
+ input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec,
+ workspace->buffer(), info.thread_id, _nthreads);
}
-CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads)
- : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads }
+CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads)
+ : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
{
}
@@ -88,28 +86,21 @@ void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const
const size_t out_row_stride = dst_strides[height_idx] / element_size_in_bytes;
const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes;
const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes;
- const auto wout_transf_ptr = reinterpret_cast<const void *>(winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
- auto dst_nhwc_ptr = reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
- void *biases_data_ptr = nullptr;
- if(biases != nullptr)
+ const auto wout_transf_ptr = reinterpret_cast<const void *>(
+ winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
+ auto dst_nhwc_ptr =
+ reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
+ void *biases_data_ptr = nullptr;
+ if (biases != nullptr)
{
biases_data_ptr = reinterpret_cast<void *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
}
// Output transform
- _winograd_impl.output_transform->execute(
- _conv_args,
- wout_transf_ptr,
- _winograd_impl.winograd_spec,
- biases_data_ptr,
- dst_nhwc_ptr,
- out_batch_stride,
- out_row_stride,
- out_col_stride,
- workspace->buffer(),
- info.thread_id,
- _nthreads);
+ _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr,
+ dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride,
+ workspace->buffer(), info.thread_id, _nthreads);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h
index 0170dcae22..8a3b745e85 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.h
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h
@@ -30,6 +30,7 @@
#include "arm_compute/core/Steps.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
#include "src/cpu/ICpuKernel.h"
@@ -53,7 +54,9 @@ public:
/** Prevent instances of this class from being moved it contains references.*/
CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete;
- CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads);
+ CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -83,7 +86,9 @@ public:
/** Prevent instances of this class from being moved it contains references.*/
CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete;
- CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads);
+ CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+ arm_conv::ConvolutionArgs &_c_args,
+ uint32_t nthreads);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -95,7 +100,7 @@ public:
private:
arm_conv::winograd::WinogradImpl &_winograd_impl;
- const arm_conv::ConvolutionArgs &_conv_args;
+ const arm_conv::ConvolutionArgs &_conv_args;
uint32_t _nthreads;
};
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
index e51b5b3423..ddc6dc24cd 100644
--- a/src/cpu/kernels/activation/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -31,7 +31,7 @@ namespace cpu
{
namespace
{
-constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 };
+constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8};
} // namespace
void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
@@ -40,4 +40,4 @@ void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLaye
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
index 2a3b8a0bfd..e558f8c73e 100644
--- a/src/cpu/kernels/activation/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -29,7 +29,7 @@ namespace cpu
{
namespace
{
-constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 };
+constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4};
} // namespace
void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h
index 05885d8476..afeb6f7f3d 100644
--- a/src/cpu/kernels/activation/generic/neon/impl.h
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -56,10 +57,14 @@ inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &ma
#endif /* __aarch64__ */
template <typename T, const ActFpImplParams &P>
-void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void fp_neon_activation_impl(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
- using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+ using ExactTagType =
+ typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
constexpr int window_step_x = P.step_x;
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -72,12 +77,12 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
// to prevent NAN values caused by zeros in inputs to SQRT.
// In case of aarh64, we call vsqrt directly, so we don't use delta.
#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {});
+ const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{});
#else /* #ifndef __aarch64__ */
- const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType {});
+ const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
#endif /* __aarch64__ */
- const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
+ const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
@@ -88,143 +93,154 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL
const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
const auto a = static_cast<T>(act_info.a());
const auto b = static_cast<T>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
- wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = wrapper::vabs(vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = wrapper::vmla(vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = wrapper::vmax(const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin,
+ wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
+ wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
+ tmp = wrapper::vsqrt(vin);
#else /* __aarch64__ */
{
const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
+ tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+ tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
}
#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
- break;
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = wrapper::vmul(vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_6,
+ wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(
+ const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
+ break;
#ifdef __aarch64__
- case ActivationLayerInfo::ActivationFunction::GELU:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
- break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = wrapper::vmul(
+ vin,
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
+ break;
#endif /* __aarch64__ */
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T in = *(reinterpret_cast<const T *>(input_ptr + x));
- T tmp;
- switch(act)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<T>(static_cast<T>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<T>(a, std::max<T>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = in / (static_cast<T>(1) + std::exp(-a * in));
- break;
- case ActivationLayerInfo::ActivationFunction::GELU:
- tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+ T tmp;
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = std::abs(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = a * in + b;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = std::max<T>(static_cast<T>(0), in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = std::min<T>(a, std::max<T>(b, in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = (in > 0) ? in : a * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = std::sqrt(in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = in * in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = a * std::tanh(b * in);
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = in;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = in / (static_cast<T>(1) + std::exp(-a * in));
+ break;
+ case ActivationLayerInfo::ActivationFunction::GELU:
+ tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
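
A minimal scalar sketch of the loop structure used by fp_neon_activation_impl (illustrative only, not part of the patch): a main loop that consumes P.step_x elements per iteration and a leftover loop for the tail. The function name, the plain-float buffers, and 'step' are assumptions; the formula matches the kernel's scalar HARD_SWISH branch.

// Illustrative sketch, not part of the patch: main loop + leftover loop with plain floats
// instead of NEON vectors; 'step' stands in for P.step_x.
#include <algorithm>
#include <cstddef>

void hard_swish_reference(const float *src, float *dst, std::size_t len, std::size_t step)
{
    std::size_t x = 0;
    // Main loop: 'step' elements per iteration (a full SIMD vector in the real kernel).
    for (; x + step <= len; x += step)
    {
        for (std::size_t i = 0; i < step; ++i)
        {
            const float in = src[x + i];
            dst[x + i]     = in * (std::min(std::max(in + 3.0f, 0.0f), 6.0f) * 0.166666667f);
        }
    }
    // Leftover elements: same math, one element at a time.
    for (; x < len; ++x)
    {
        const float in = src[x];
        dst[x]         = in * (std::min(std::max(in + 3.0f, 0.0f), 6.0f) * 0.166666667f);
    }
}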
diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp
index c973e964e4..f289c80d4b 100644
--- a/src/cpu/kernels/activation/generic/neon/lut.cpp
+++ b/src/cpu/kernels/activation/generic/neon/lut.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -33,19 +34,22 @@ namespace cpu
#ifdef __aarch64__
void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
- ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
+ src->info()->data_type() != DataType::QASYMM8_SIGNED);
const auto window_end_x = window.x().end();
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
- },
- input, output);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
}
#endif // __aarch64__
} // namespace cpu
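
The LUT kernel above reduces the whole quantized activation to a byte-wise table lookup; lut_u8_neon consumes the table carried by act_info.lut(). A hedged sketch of what building and applying such a table amounts to: build_lut and apply_lut are hypothetical helpers, and the 256-entry size and the quantization mapping are assumptions, not taken from this patch.

// Illustrative sketch, not part of the patch. build_lut/apply_lut are hypothetical helpers;
// the 256-entry table and the q = round(x / scale) + offset mapping are assumptions about
// what act_info.lut() carries for QASYMM8 / QASYMM8_SIGNED.
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>

std::array<uint8_t, 256> build_lut(
    float scale_in, int32_t offset_in, float scale_out, int32_t offset_out, float (*act)(float))
{
    std::array<uint8_t, 256> lut{};
    for (int i = 0; i < 256; ++i)
    {
        const float x = (i - offset_in) * scale_in;                                // de-quantize the raw byte
        const float y = act(x);                                                    // activation in float
        const int   q = static_cast<int>(std::lround(y / scale_out)) + offset_out; // re-quantize
        lut[static_cast<std::size_t>(i)] = static_cast<uint8_t>(std::min(std::max(q, 0), 255));
    }
    return lut;
}

void apply_lut(const std::array<uint8_t, 256> &lut, const uint8_t *src, uint8_t *dst, std::size_t len)
{
    for (std::size_t i = 0; i < len; ++i)
    {
        dst[i] = lut[src[i]]; // the NEON/SVE2 kernels do this with vector table lookups
    }
}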
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index e7c146e46f..1451301ea2 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -38,7 +39,10 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -85,206 +89,222 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
float32x4_t vs = vdupq_n_f32(s);
float32x4_t vo = vdupq_n_f32(o);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
- wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+ wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_u8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_u8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
-
- const uint32x4x4_t pos_mask =
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
{
- {
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ const uint32x4x4_t pos_mask = {{
wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
+ }};
- const float32x4x4_t tmp_dep =
- {
- {
+ const float32x4x4_t tmp_dep = {{
wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
+ }};
- tmp = vquantize(tmp_dep, qi_out);
- }
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#else // #ifndef __aarch64__
- else if (act == ActivationLayerInfo::ActivationFunction::GELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
{
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(vin_deq.val[0],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[0], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[1],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[1], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[2],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[2], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[3],
+ wrapper::vmul(const_inv_2,
+ wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+ vin_deq.val[3], const_inv_sqrt_2))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#endif // __aarch64__
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
- }
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::GELU)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+                    float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f       = tmp_f * 0.5f * (1.0f + std::erf(tmp_f / 1.41421356237f));
+                    tmp         = quantize_qasymm8(tmp_f, qi_out);
+ }
#endif // __aarch64__
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
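
The float branches of neon_qasymm8_activation all follow the per-element round trip visible in the leftover loop: de-quantize, apply the activation in float, re-quantize. A minimal sketch for the TANH case; QInfo is an illustrative stand-in for UniformQuantizationInfo, and the q = round(x / scale) + offset mapping is an assumed convention.

// Illustrative sketch, not part of the patch.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo
{
    float   scale;
    int32_t offset;
};

uint8_t qasymm8_tanh(uint8_t in, QInfo qi_in, QInfo qi_out, float a, float b)
{
    const float x = (static_cast<int32_t>(in) - qi_in.offset) * qi_in.scale;          // de-quantize
    const float y = a * std::tanh(b * x);                                             // activation in float
    const int   q = static_cast<int>(std::lround(y / qi_out.scale)) + qi_out.offset;  // re-quantize
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}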
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index 52c396459b..a2f588245a 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -36,7 +37,10 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -76,191 +80,195 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
float32x4_t vs = vdupq_n_f32(s);
float32x4_t vo = vdupq_n_f32(o);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
- wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+ wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_s8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
- }
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ // Perform activation
+ tmp = vmaxq_s8(vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+ // Re-quantize to new output space
+ tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation
+ const float32x4x4_t tmp_dep = {{
+ wrapper::vmul(
+ vin_deq.val[0],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[1],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[2],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+ wrapper::vmul(
+ vin_deq.val[3],
+ wrapper::vmul(
+ const_inv_6_f32,
+ wrapper::vmin(const_6_f32,
+ wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
#ifdef __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
+ const uint32x4x4_t pos_mask = {{
wrapper::vcgtz(vin_deq.val[0]),
wrapper::vcgtz(vin_deq.val[1]),
wrapper::vcgtz(vin_deq.val[2]),
wrapper::vcgtz(vin_deq.val[3]),
- }
- };
+ }};
#else // __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
+ const uint32x4x4_t pos_mask = {{
wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
+ }};
#endif // __aarch64__
- const float32x4x4_t tmp_dep =
- {
- {
+ const float32x4x4_t tmp_dep = {{
wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
+ }};
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ tmp = vquantize_signed(tmp_dep, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
- }
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ tmp = std::max(const_0, in);
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(const_0, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ tmp = std::min(a, std::max(b, in));
+ tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+ }
#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
#endif // __aarch64__
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
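
The RELU-family branches stay in the quantized domain and re-quantize with tmp * s + o, where s, o and const_0 are computed before the loop (outside this hunk). A sketch under the assumption that s is the input/output scale ratio, o the adjusted offset, and const_0 the quantized zero value.

// Illustrative sketch, not part of the patch. Assumed: s = qi_in.scale / qi_out.scale,
// o = qi_out.offset - qi_in.offset * s, const_0 = quantized representation of 0.
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t qasymm8_signed_relu(int8_t in, int8_t const_0, float s, float o)
{
    const int8_t clamped = std::max(const_0, in);        // activation on the raw quantized value
    const long   requant = std::lround(clamped * s + o); // re-map into the output quantization
    return static_cast<int8_t>(std::min<long>(127, std::max<long>(-128, requant)));
}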
diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
index 2aea6cba3c..891646ea00 100644
--- a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -21,11 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -38,7 +39,10 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
constexpr int window_step_x = 8;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -59,103 +63,94 @@ void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL
const float a_f32 = act_info.a();
const float b_f32 = act_info.b();
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
- wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
- ARM_COMPUTE_UNUSED(tmp);
+ wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
+ ARM_COMPUTE_UNUSED(tmp);
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- {
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
+ }};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
- {
- wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
- wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ // De-quantize
+ const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+ // Perform activation
+ const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
+ wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}};
+ // Re-quantize to new output space
+ tmp = vquantize_int16(tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ wrapper::vstore(output_ptr + x, tmp);
}
- wrapper::vstore(output_ptr + x, tmp);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
- qsymm16_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
+ qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
+ qsymm16_t tmp = 0;
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = 1.f / (1.f + std::exp(-tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+ tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
+ tmp = quantize_qsymm16(tmp_f, qi_out);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ *(output_ptr + x) = tmp;
}
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
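
QSYMM16 is a symmetric format, so the de-quantize/re-quantize steps above use only a scale and no offset. A sketch of the LOGISTIC leftover path under that assumption; the clamping range is assumed rather than taken from quantize_qsymm16.

// Illustrative sketch, not part of the patch: symmetric 16-bit quantization uses a scale only.
#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t qsymm16_logistic(int16_t in, float scale_in, float scale_out)
{
    const float x = in * scale_in;              // de-quantize (no offset for symmetric formats)
    const float y = 1.f / (1.f + std::exp(-x)); // LOGISTIC, as in the leftover loop above
    const long  q = std::lround(y / scale_out); // re-quantize
    return static_cast<int16_t>(std::min<long>(32767, std::max<long>(-32768, q)));
}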
diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 4757c60d8f..97399e01e0 100644
--- a/src/cpu/kernels/activation/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -29,11 +29,11 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
@@ -59,77 +59,87 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer
const auto va = svdup_n_f16(act_info.a());
const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
- svfloat16_t tmp;
+ svfloat16_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- const auto vin = svld1_f16(pg, input_ptr + x);
- switch(act)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f16_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f16_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f16_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = svmul_f16_z(pg, vin, svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f16(pg, output_ptr + x, tmp);
+ const auto vin = svld1_f16(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f16_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f16_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
+ svmax_f16_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
+ svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f16_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f16_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svmul_f16_z(
+ pg, const_inv_6,
+ svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f16_z(
+ pg, vin,
+ svinv_f16_z(pg, svadd_f16_z(pg, const_1,
+ svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f16(pg, output_ptr + x, tmp);
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
index 87f04c255a..d1b075d52c 100644
--- a/src/cpu/kernels/activation/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
+#include <arm_sve.h>
#include <cmath>
#include <cstddef>
-#include <arm_sve.h>
-
namespace arm_compute
{
namespace cpu
@@ -58,78 +58,89 @@ void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayer
const auto va = svdup_n_f32(act_info.a());
const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
- svfloat32_t tmp;
+ svfloat32_t tmp;
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
- {
- const auto vin = svld1_f32(pg, input_ptr + x);
- switch(act)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
{
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f32_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f32_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f32_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
- break;
- case ActivationLayerInfo::ActivationFunction::SWISH:
- tmp = svmul_f32_z(pg, vin, svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f32(pg, output_ptr + x, tmp);
+ const auto vin = svld1_f32(pg, input_ptr + x);
+ switch (act)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ tmp = svabs_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ tmp = svmla_f32_z(pg, vb, va, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ tmp = svmax_f32_z(pg, const_0, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+ tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va),
+ svmax_f32_z(pg, vin, const_0));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin,
+ svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::ELU:
+ tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin,
+ svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ tmp = svsqrt_f32_z(pg, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ tmp = svmul_f32_z(pg, vin, vin);
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
+ break;
+ case ActivationLayerInfo::ActivationFunction::IDENTITY:
+ tmp = vin;
+ break;
+ case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svmul_f32_z(
+ pg, const_inv_6,
+ svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SWISH:
+ tmp = svmul_f32_z(
+ pg, vin,
+ svinv_f32_z(pg, svadd_f32_z(pg, const_1,
+ svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
+ svst1_f32(pg, output_ptr + x, tmp);
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
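
The predicated SVE intrinsics in the FP32 activation kernel above map directly onto the scalar definitions of the activation functions. As a reading aid, here is a minimal scalar sketch of three of the less common cases; it is illustrative only and not part of the patch, with "threshold" standing in for the kernel's soft_relu_thresh constant and "a" for act_info.a().

// Scalar reference for a few of the activations vectorised above (illustrative only).
#include <algorithm>
#include <cmath>

inline float hard_swish(float x)
{
    return x * std::min(6.f, std::max(0.f, x + 3.f)) / 6.f;
}

inline float soft_relu(float x, float threshold)
{
    // Above the threshold the kernel passes the input through unchanged to avoid overflow in exp.
    return x > threshold ? x : std::log1p(std::exp(x));
}

inline float swish(float x, float a)
{
    return x / (1.f + std::exp(-a * x));
}
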
diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp
index d65de8d649..2ed667debf 100644
--- a/src/cpu/kernels/activation/generic/sve2/lut.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -33,19 +34,22 @@ namespace cpu
#ifdef __aarch64__
void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
- ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
+ src->info()->data_type() != DataType::QASYMM8_SIGNED);
const auto window_end_x = window.x().end();
Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = input.ptr();
- auto output_ptr = output.ptr();
- lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
- },
- input, output);
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = input.ptr();
+ auto output_ptr = output.ptr();
+ lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+ },
+ input, output);
}
#endif // __aarch64__
} // namespace cpu
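
The LUT kernel above handles QASYMM8 and QASYMM8_SIGNED activations by indexing a precomputed 256-entry table, so the per-element work is a single lookup instead of a dequantize, activate, requantize chain. A scalar sketch of the idea follows; the names are illustrative, and the real lut_u8_sve2 helper works on whole rows at a time using vector table lookups.

// Scalar sketch of table-driven 8-bit activation (illustrative only).
#include <array>
#include <cstddef>
#include <cstdint>

void lut_activation_u8(const std::array<uint8_t, 256> &lut, const uint8_t *src, uint8_t *dst, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        // One table lookup replaces de-quantize + activation + re-quantize per element.
        dst[i] = lut[src[i]];
    }
}
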
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index bc9bc7aa3c..7efa9e4b72 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -26,18 +26,21 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEAsymm.h"
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -61,7 +64,7 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
// Initialise scale/offset for re-quantization
bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
{
requant = false;
}
@@ -78,139 +81,160 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL
const auto vo_s32 = svdup_n_s32(o_s32);
// Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- svuint8_t tmp;
+ svuint8_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_u8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_u8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
-
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
{
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
-
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 = svcreate4_s32(
- svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
+ const auto vin = svld1_u8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
{
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmax_u8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
}
-
- // Multiply negative elements and requantize if necessary
- if(requant)
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // Perform activation
+ tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+
+ // Re-quantize to new output space
+ tmp = svquantize_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
+
+ // Expand to int32
+ const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+ svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+ svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
+
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+                    // Convert int32 vectors to uint16 vectors (with saturation)
+ const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+ // convert uint16 vectors to uint8 vectors (with saturation)
+ tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
}
else
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ ARM_COMPUTE_ERROR("Unsupported activation function");
}
- // Convert uint32 vectors to uint16 vectors (with saturation)
- const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_u8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
+ svst1_u8(pg, output_ptr + x, tmp);
- }
- while(svptest_any(svptrue_b8(), pg));
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
- },
- input, output);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
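
The LEAKY_RELU branch above stays in the integer domain: input offset, slope and output offset are folded into two multiplier/offset pairs with 8 fractional bits, one pair for non-negative lanes and one for negative lanes, and the result is shifted right by 8 and saturated back to 8 bits. The following scalar sketch covers the requantizing path under those assumptions; names are illustrative, and s_pos/o_pos mirror the s_s32/o_s32 constants set up earlier in the function, outside the hunk shown.

// Scalar sketch of the fixed-point LEAKY_RELU requantization (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t leaky_relu_qasymm8(uint8_t x, float scale_in, int32_t offset_in,
                           float scale_out, int32_t offset_out, float alpha)
{
    const float   s     = scale_in / scale_out;
    const int32_t s_pos = static_cast<int32_t>(std::lround(s * 256.f));
    const int32_t o_pos = static_cast<int32_t>(std::lround((-offset_in * s + offset_out) * 256.f));
    const int32_t s_neg = static_cast<int32_t>(std::lround(s * alpha * 256.f));
    const int32_t o_neg = static_cast<int32_t>(std::lround((-offset_in * s * alpha + offset_out) * 256.f));

    // The kernel compares the raw value against the input zero point to pick the pair of constants.
    const bool    negative = (scale_in >= 0) ? (x < offset_in) : (x > offset_in);
    const int32_t acc      = negative ? (x * s_neg + o_neg) : (x * s_pos + o_pos);
    return static_cast<uint8_t>(std::clamp(acc >> 8, 0, 255));
}

Shifting right by 8 undoes the 2^8 scaling of the folded constants, and the clamp plays the role of the saturating narrowing steps in the vector code.
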
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index d20684f54d..e4667522dd 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -24,20 +24,23 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <cmath>
-#include <cstddef>
#include "src/core/NEON/SVEAsymm.h"
#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -65,7 +68,7 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
// Initialise scale/offset for re-quantization
bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+ if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
{
requant = false;
}
@@ -82,151 +85,190 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti
const auto vo_s32 = svdup_n_s32(o_s32);
// Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+ int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- svint8_t tmp;
+ svint8_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_s8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_s8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep = svcreate4_f32(
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))));
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
{
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 = svcreate4_s32(
- svmovlb_s32(svmovlb_s16(vin)),
- svmovlt_s32(svmovlb_s16(vin)),
- svmovlb_s32(svmovlt_s16(vin)),
- svmovlt_s32(svmovlt_s16(vin)));
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
+ const auto vin = svld1_s8(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::RELU)
{
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmax_s8_z(pg, vconst_0, vin);
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
-
- // Multiply negative elements and requantize if necessary
- if(requant)
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
+ // Perform activation
+ tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+ // Re-quantize to new output space
+ tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
}
- else
+ else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
{
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ // De-quantize
+ const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+ // Perform activation
+ const svfloat32x4_t tmp_dep = svcreate4_f32(
+ svmul_f32_z(pg, svget4_f32(vin_deq, 0),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 0),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 1),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 1),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 2),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 2),
+ const_3_f32))))),
+ svmul_f32_z(pg, svget4_f32(vin_deq, 3),
+ svmul_f32_z(pg, const_inv_6_f32,
+ svmin_f32_z(pg, const_6_f32,
+ svmax_f32_z(pg, const_0_f32,
+ svadd_f32_z(pg, svget4_f32(vin_deq, 3),
+ const_3_f32))))));
+ // Re-quantize to new output space
+ tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
}
+ else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+ {
+ svbool_t p0, p1, p2, p3;
+ svint32x4_t tmp_dep;
- // Convert uint32 vectors to uint16 vectors (with saturation)
- const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+ // Expand to int32
+ const svint32x4_t vin_s32 =
+ svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)),
+ svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
- // convert uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
+ // Compare elements to input offset
+ if (qi_in.scale >= 0)
+ {
+ p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+ else
+ {
+ p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+ p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+ p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+ p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+ }
+
+ // Multiply negative elements and requantize if necessary
+ if (requant)
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+ svsel(p0, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+ svsel(p1, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+ svsel(p2, vs_leaky_s32, vs_s32)),
+ 8),
+ svasr_n_s32_m(pg,
+ svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+ svsel(p3, vs_leaky_s32, vs_s32)),
+ 8));
+ }
+ else
+ {
+ tmp_dep = svcreate4_s32(
+ svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+ svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+ }
+
+            // Convert int32 vectors to int16 vectors (with saturation)
+ const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+ const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+            // Convert int16 vectors to int8 vectors (with saturation)
+ tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
- svst1_s8(pg, output_ptr + x, tmp);
+ svst1_s8(pg, output_ptr + x, tmp);
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
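
The quantized branches of the signed kernel above all follow the same pattern, dequantize to float, apply the activation, requantize; HARD_SWISH is the case that appears only in this signed variant. A scalar sketch of that path follows; it is illustrative only, and the exact rounding of the vector quantize helper is assumed to be round-to-nearest.

// Scalar sketch of the de-quantize -> hard-swish -> re-quantize pattern vectorised above.
#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t hard_swish_qasymm8_signed(int8_t x, float scale_in, int32_t offset_in,
                                 float scale_out, int32_t offset_out)
{
    const float   xf  = scale_in * (static_cast<float>(x) - static_cast<float>(offset_in)); // de-quantize
    const float   act = xf * std::min(6.f, std::max(0.f, xf + 3.f)) / 6.f;                  // hard swish
    const int32_t q   = static_cast<int32_t>(std::lround(act / scale_out)) + offset_out;    // re-quantize
    return static_cast<int8_t>(std::clamp(q, -128, 127));
}
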
diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
index 5154fac8a7..f955893307 100644
--- a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -21,24 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include <cmath>
-#include <cstddef>
-
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/SVESymm.h"
+
#include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
namespace arm_compute
{
namespace cpu
{
-void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qsymm16_activation(const ITensor *src,
+ ITensor *dst,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -56,62 +59,70 @@ void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL
const auto va_f32 = svdup_n_f32(act_info.a());
const auto vb_f32 = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- svint16_t tmp;
+ svint16_t tmp;
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- const auto vin = svld1_s16(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep = svcreate2_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep = svcreate2_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep = svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
- svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
{
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
+ const auto vin = svld1_s16(pg, input_ptr + x);
+ if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+ svdiv_f32_z(
+ pg, vconst_1,
+ svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep = svcreate2_f32(
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+ svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ // De-quantize
+ auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+ // Perform activation
+ const svfloat32x2_t tmp_dep =
+ svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
+ svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
+ // Re-quantize to new output space
+ tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported activation function");
+ }
- svst1_s16(pg, output_ptr + x, tmp);
+ svst1_s16(pg, output_ptr + x, tmp);
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
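
QSYMM16 is a symmetric format, so the kernel above converts with a scale only and no zero-point offset. A scalar sketch of the LOGISTIC path under that assumption, with illustrative names:

// Scalar sketch of the QSYMM16 logistic path (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t logistic_qsymm16(int16_t x, float scale_in, float scale_out)
{
    const float   xf  = scale_in * static_cast<float>(x);                   // de-quantize: no offset in QSYMM16
    const float   act = 1.f / (1.f + std::exp(-xf));                        // logistic
    const int32_t q   = static_cast<int32_t>(std::lround(act / scale_out)); // re-quantize
    return static_cast<int16_t>(std::clamp(q, -32768, 32767));
}
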
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
index fca7b2cd9f..e7679c14e3 100644
--- a/src/cpu/kernels/add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -30,10 +30,11 @@ namespace arm_compute
{
namespace cpu
{
-void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<float16_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
index 1f599b1968..11a970bef4 100644
--- a/src/cpu/kernels/add/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -28,9 +28,10 @@ namespace arm_compute
{
namespace cpu
{
-void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp32_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<float>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index 2dde13544a..34938cc4c4 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -23,8 +23,10 @@
*/
#include "src/cpu/kernels/add/generic/neon/impl.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -40,7 +42,10 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true);
}
-bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition)
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ bool is_addition)
{
const auto iq0 = src0->quantization_info().uniform();
const auto iq1 = src1->quantization_info().uniform();
@@ -49,7 +54,7 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI
const auto scale0 = iq0.scale / oq.scale;
const auto scale1 = iq1.scale / oq.scale;
- if(scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
+ if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
{
// The scale factor cannot be stored as 5.11 signed fixed-point number.
return false;
@@ -57,9 +62,10 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI
const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
- const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
+ const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset))
+ : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
- if(max_acc > 1048575.f) // 2^20 - 1
+ if (max_acc > 1048575.f) // 2^20 - 1
{
// It might not be possible to store the result as 21.11 signed fixed-point number.
return false;
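
The two early-outs above reflect the fixed-point formats used later in the kernel: each input scale must fit a signed 5.11 fixed-point value (15 is used as a conservative bound) and the widened worst-case accumulator must fit a signed 21.11 value, whose magnitude is limited to 2^20 - 1. A scalar restatement of the feasibility check for the addition case, with illustrative names:

// Illustrative restatement of the Q8 fixed-point add feasibility check (addition branch).
#include <cmath>

bool q8_fixedpoint_add_possible(float scale0, float scale1, float offset)
{
    // Each per-input scale must be representable as a signed 5.11 fixed-point number.
    if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
    {
        return false;
    }
    // Worst-case accumulator over a 256-step quantized range must fit signed 21.11.
    const float max_acc = (std::fabs(scale0) + std::fabs(scale1)) * 256.f + std::fabs(offset);
    return max_acc <= 1048575.f; // 2^20 - 1
}
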
@@ -69,13 +75,19 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI
}
template <typename ScalarType>
-void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_q8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_q8_neon_fixedpoint<ScalarType>(src0, src1, dst, policy, window, true /*is_addition*/);
}
template <typename ScalarType>
-void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_q8_neon_fixedpoint(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
{
ARM_COMPUTE_UNUSED(policy);
@@ -103,7 +115,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
const auto oq_info = dst->info()->quantization_info().uniform();
const auto in0_scale = iq0_info.scale / oq_info.scale;
const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale));
- const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
+ const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
constexpr float _2pow11 = 2048;
const auto in0_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in0_scale * _2pow11));
@@ -112,7 +124,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
constexpr uint8_t shift_amount_remainder = 3;
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Prefix: a = non-broadcast, b = broadcast.
@@ -138,68 +150,75 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
- const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
- const auto b_val = *b_ptr;
- const auto b_scaled = b_scale * b_val;
- const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
- const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11;
- const auto b_vscaled_offseted_21p11 = wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+ const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ const auto b_val = *b_ptr;
+ const auto b_scaled = b_scale * b_val;
+ const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
+ const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11;
+ const auto b_vscaled_offseted_21p11 =
+ wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
#ifndef __aarch64__
- const auto b_scaled_offseted = b_scaled + offset;
+ const auto b_scaled_offseted = b_scaled + offset;
#endif // __aarch64__
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the input.
- const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
-
- // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
- const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
- const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
-
- // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
- // Widen and store the result in 32-bit integer.
- const auto vout_21p11_00 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
- const auto vout_21p11_01 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
- const auto vout_21p11_10 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
- const auto vout_21p11_11 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
-
- // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
- const auto vout_8p8_0 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
- const auto vout_8p8_1 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
-
- // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<8>(vout_8p8_0),
- wrapper::vqrshrn<8>(vout_8p8_1));
-
- // Store the result.
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
-
- // Process the left-over elements.
- for(; x < window_end_x; ++x)
- {
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the input.
+ const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+ // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+ const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+ const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+ // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
+ // Widen and store the result in 32-bit integer.
+ const auto vout_21p11_00 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
+ const auto vout_21p11_01 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
+ const auto vout_21p11_10 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
+ const auto vout_21p11_11 =
+ wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
+
+ // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+ const auto vout_8p8_0 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+ const auto vout_8p8_1 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+
+ // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+ // Store the result.
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ // Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
+ out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+ int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
#else // __aarch64__
- out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
+ out_ptr[x] = utility::clamp<int, ScalarType>(
+ support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
#endif // __aarch64__
- }
- },
- b_input_it, a_input_it, out_it);
+ }
+ },
+ b_input_it, a_input_it, out_it);
}
else
{
@@ -216,70 +235,85 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso
Iterator out_it(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
- const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Load the inputs.
- const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
- const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
-
- // Widen the input elements to signed 16-bit regardless of the input signedness.
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
- const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
- const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
- const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
-
- // Multiply the input elements by the scale factor and add the offset.
- // Widen and store the result in 32-bit integer.
- const auto vscaled0_offseted_21p11_00 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
- const auto vscaled0_offseted_21p11_01 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
- const auto vscaled0_offseted_21p11_10 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
- const auto vscaled0_offseted_21p11_11 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
-
- const auto vout_21p11_00 = wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
- const auto vout_21p11_01 = wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
- const auto vout_21p11_10 = wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
- const auto vout_21p11_11 = wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
-
- // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
- const auto vout_8p8_0 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
- const auto vout_8p8_1 = wrapper::vcombine(
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
- wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
-
- // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
- const auto vout_8p0 = wrapper::vcombine(
- wrapper::vqrshrn<8>(vout_8p8_0),
- wrapper::vqrshrn<8>(vout_8p8_1));
-
- // Store the result.
- wrapper::vstore(out_ptr + x, vout_8p0);
- }
-
- // Process the left-over elements.
- for(; x < window_end_x; ++x)
+ win,
+ [&](const Coordinates &)
{
+ const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+ const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+
+ int x = window_start_x;
+
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Load the inputs.
+ const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+ const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+
+ // Widen the input elements to signed 16-bit regardless of the input signedness.
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+ const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+ const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+ const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+
+ // Multiply the input elements by the scale factor and add the offset.
+ // Widen and store the result in 32-bit integer.
+ const auto vscaled0_offseted_21p11_00 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_01 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_10 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
+ const auto vscaled0_offseted_21p11_11 =
+ wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
+
+ const auto vout_21p11_00 =
+ wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
+ const auto vout_21p11_01 =
+ wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
+ const auto vout_21p11_10 =
+ wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
+ const auto vout_21p11_11 =
+ wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
+
+ // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+ const auto vout_8p8_0 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+ const auto vout_8p8_1 =
+ wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+ wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+
+ // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+ const auto vout_8p0 =
+ wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+ // Store the result.
+ wrapper::vstore(out_ptr + x, vout_8p0);
+ }
+
+ // Process the left-over elements.
+ for (; x < window_end_x; ++x)
+ {
#ifdef __aarch64__
- out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
+ out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+ int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
#else // __aarch64__
- out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
+ out_ptr[x] = utility::clamp<int, ScalarType>(
+ support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
#endif // __aarch64__
- }
- },
- in0_it, in1_it, out_it);
+ }
+ },
+ in0_it, in1_it, out_it);
}
}
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
{
ARM_COMPUTE_UNUSED(policy);
@@ -304,7 +338,7 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst
const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -324,63 +358,64 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
- const auto output_ptr = output.ptr();
-
- const auto broadcast_value = *broadcast_input.ptr();
- const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
- const auto bfs = float(broadcast_value) * bf_scale + offset;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
+ const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
+ const auto output_ptr = output.ptr();
+
+ const auto broadcast_value = *broadcast_input.ptr();
+ const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
+ const auto bfs = float(broadcast_value) * bf_scale + offset;
- const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
- const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
- const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
- const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
- const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
- const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+ const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+ const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(af_0);
- rf_1 = vcvtnq_s32_f32(af_1);
- rf_2 = vcvtnq_s32_f32(af_2);
- rf_3 = vcvtnq_s32_f32(af_3);
+ rf_0 = vcvtnq_s32_f32(af_0);
+ rf_1 = vcvtnq_s32_f32(af_1);
+ rf_2 = vcvtnq_s32_f32(af_2);
+ rf_3 = vcvtnq_s32_f32(af_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(af_0);
- rf_1 = vcvtq_s32_f32(af_1);
- rf_2 = vcvtq_s32_f32(af_2);
- rf_3 = vcvtq_s32_f32(af_3);
+ rf_0 = vcvtq_s32_f32(af_0);
+ rf_1 = vcvtq_s32_f32(af_1);
+ rf_2 = vcvtq_s32_f32(af_2);
+ rf_3 = vcvtq_s32_f32(af_3);
#endif //__aarch64__
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- broadcast_input, non_broadcast_input, output);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -397,72 +432,78 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst
const auto voffset = vdupq_n_f32(offset);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = input1.ptr();
- const auto input2_ptr = input2.ptr();
- const auto output_ptr = output.ptr();
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t a = vld1q_u8(input1_ptr + x);
- const uint8x16_t b = vld1q_u8(input2_ptr + x);
-
- const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
- const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
- const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
- const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
-
- const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
- const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
- const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
- const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
-
- const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
- const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
- const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
- const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto input1_ptr = input1.ptr();
+ const auto input2_ptr = input2.ptr();
+ const auto output_ptr = output.ptr();
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t a = vld1q_u8(input1_ptr + x);
+ const uint8x16_t b = vld1q_u8(input2_ptr + x);
+
+ const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+ const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+ const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
+ const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
+
+ const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+ const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
+ const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
+ const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
+ const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(bf_0);
- rf_1 = vcvtnq_s32_f32(bf_1);
- rf_2 = vcvtnq_s32_f32(bf_2);
- rf_3 = vcvtnq_s32_f32(bf_3);
+ rf_0 = vcvtnq_s32_f32(bf_0);
+ rf_1 = vcvtnq_s32_f32(bf_1);
+ rf_2 = vcvtnq_s32_f32(bf_2);
+ rf_3 = vcvtnq_s32_f32(bf_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(bf_0);
- rf_1 = vcvtq_s32_f32(bf_1);
- rf_2 = vcvtq_s32_f32(bf_2);
- rf_3 = vcvtq_s32_f32(bf_3);
+ rf_0 = vcvtq_s32_f32(bf_0);
+ rf_1 = vcvtq_s32_f32(bf_1);
+ rf_2 = vcvtq_s32_f32(bf_2);
+ rf_3 = vcvtq_s32_f32(bf_3);
#endif //__aarch64__
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
+ const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- input1, input2, output);
+ }
+ },
+ input1, input2, output);
}
}
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition)
{
ARM_COMPUTE_UNUSED(policy);
@@ -487,7 +528,7 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens
const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -507,63 +548,64 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
- const auto bfs = float(broadcast_value) * bf_scale + offset;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
- const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+ const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
+ const auto bfs = float(broadcast_value) * bf_scale + offset;
- const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
- const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
- const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
- const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+ const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+
+ const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(af_0);
- rf_1 = vcvtnq_s32_f32(af_1);
- rf_2 = vcvtnq_s32_f32(af_2);
- rf_3 = vcvtnq_s32_f32(af_3);
+ rf_0 = vcvtnq_s32_f32(af_0);
+ rf_1 = vcvtnq_s32_f32(af_1);
+ rf_2 = vcvtnq_s32_f32(af_2);
+ rf_3 = vcvtnq_s32_f32(af_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(af_0);
- rf_1 = vcvtq_s32_f32(af_1);
- rf_2 = vcvtq_s32_f32(af_2);
- rf_3 = vcvtq_s32_f32(af_3);
+ rf_0 = vcvtq_s32_f32(af_0);
+ rf_1 = vcvtq_s32_f32(af_1);
+ rf_2 = vcvtq_s32_f32(af_2);
+ rf_3 = vcvtq_s32_f32(af_3);
#endif //__aarch64__
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- broadcast_input, non_broadcast_input, output);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -580,79 +622,102 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens
const auto voffset = vdupq_n_f32(offset);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t a = vld1q_s8(input1_ptr + x);
- const int8x16_t b = vld1q_s8(input2_ptr + x);
-
- const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
- const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
- const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
- const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
-
- const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
- const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
- const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
- const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
-
- const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
- const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
- const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
- const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int8x16_t a = vld1q_s8(input1_ptr + x);
+ const int8x16_t b = vld1q_s8(input2_ptr + x);
+
+ const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+ const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+ const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
+ const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
+
+ const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+ const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+ const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+ const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+ const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
+ const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
+ const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
+ const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
+ int32x4_t rf_2{};
+ int32x4_t rf_3{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(bf_0);
- rf_1 = vcvtnq_s32_f32(bf_1);
- rf_2 = vcvtnq_s32_f32(bf_2);
- rf_3 = vcvtnq_s32_f32(bf_3);
+ rf_0 = vcvtnq_s32_f32(bf_0);
+ rf_1 = vcvtnq_s32_f32(bf_1);
+ rf_2 = vcvtnq_s32_f32(bf_2);
+ rf_3 = vcvtnq_s32_f32(bf_3);
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(bf_0);
- rf_1 = vcvtq_s32_f32(bf_1);
- rf_2 = vcvtq_s32_f32(bf_2);
- rf_3 = vcvtq_s32_f32(bf_3);
+ rf_0 = vcvtq_s32_f32(bf_0);
+ rf_1 = vcvtq_s32_f32(bf_1);
+ rf_2 = vcvtq_s32_f32(bf_2);
+ rf_3 = vcvtq_s32_f32(bf_3);
#endif //__aarch64__
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
+ const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+ const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+ vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
#ifdef __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
#else // __aarch64__
- output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+ output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
#endif // __aarch64__
- }
- },
- input1, input2, output);
+ }
+ },
+ input1, input2, output);
}
}
-template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-
-template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+template void add_q8_neon_fixedpoint<int8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_q8_neon_fixedpoint<uint8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+
+template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
} // namespace cpu
} // namespace arm_compute
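The QASYMM8 paths reformatted above all reduce to the same per-element arithmetic: scale1 = iq1.scale / oq.scale, scale2 = ±iq2.scale / oq.scale, and a single folded offset, followed by a round (AArch64) or truncate (elsewhere) and a clamp to the 8-bit range. A minimal scalar sketch of that computation, with a hypothetical QuantParams helper standing in for UniformQuantizationInfo:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantParams { float scale; int32_t offset; }; // stand-in for UniformQuantizationInfo

uint8_t add_sub_qasymm8_scalar(uint8_t in1, uint8_t in2,
                               QuantParams q1, QuantParams q2, QuantParams qo,
                               bool is_addition)
{
    const float scale1 = q1.scale / qo.scale;
    const float scale2 = is_addition ? (q2.scale / qo.scale) : -(q2.scale / qo.scale);
    // Fold the three zero points into one additive constant, as the kernel does.
    const float offset = float(qo.offset) - scale1 * float(q1.offset) - scale2 * float(q2.offset);

    const float result = float(in1) * scale1 + float(in2) * scale2 + offset;
    return static_cast<uint8_t>(std::clamp(std::lround(result), 0L, 255L)); // round-to-nearest variant
}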
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
index fb786c5bc1..faa99baffe 100644
--- a/src/cpu/kernels/add/generic/neon/impl.h
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -35,7 +36,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_same_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
@@ -53,7 +55,7 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -69,31 +71,36 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ win,
+ [&](const Coordinates &)
{
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
- }
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ const auto res = (policy == ConvertPolicy::SATURATE)
+ ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v)
+ : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (policy == ConvertPolicy::SATURATE)
+ ? wrapper::add_sat(broadcast_value, non_broadcast_v)
+ : broadcast_value + non_broadcast_v;
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -106,31 +113,34 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ win,
+ [&](const Coordinates &)
{
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res =
+ (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) =
+ (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
+ }
+ },
+ input1, input2, output);
}
}
@@ -138,17 +148,36 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo
bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition);
-
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ bool is_addition);
+
+void add_sub_qasymm8_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
+
+void add_sub_qasymm8_signed_neon(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
template <typename ScalarType>
-void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+void add_q8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template <typename ScalarType>
-void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+void add_sub_q8_neon_fixedpoint(const ITensor *src0,
+ const ITensor *src1,
+ ITensor *dst,
+ const ConvertPolicy &policy,
+ const Window &window,
+ bool is_addition);
} // namespace cpu
} // namespace arm_compute
#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H
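add_same_neon, declared above, distinguishes two cases: when both inputs span the full x dimension it adds element-wise, and when one input has x-extent 1 its window step is set to 0 and the single value is broadcast across the row. The sketch below mirrors that control flow with plain loops; the helper names and the integer saturating add are illustrative, not the library's wrappers.

#include <cstddef>
#include <cstdint>
#include <limits>

template <typename T>
T add_sat_scalar(T a, T b) // illustrative stand-in for wrapper::add_sat (integer types)
{
    const int64_t sum = int64_t(a) + int64_t(b);
    const int64_t lo  = std::numeric_limits<T>::lowest();
    const int64_t hi  = std::numeric_limits<T>::max();
    return static_cast<T>(sum < lo ? lo : (sum > hi ? hi : sum));
}

template <typename T>
void add_row(const T *in0, size_t len0, const T *in1, size_t len1, T *out, bool saturate)
{
    // An input with x-extent 1 contributes the same value to every output element,
    // mirroring the is_broadcast_across_x branch of the kernel.
    const size_t len = (len0 > len1) ? len0 : len1;
    for (size_t x = 0; x < len; ++x)
    {
        const T a = in0[(len0 == 1) ? 0 : x];
        const T b = in1[(len1 == 1) ? 0 : x];
        out[x]    = saturate ? add_sat_scalar(a, b) : static_cast<T>(a + b);
    }
}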
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
index 5698d6d552..f0bcebc9d2 100644
--- a/src/cpu/kernels/add/generic/neon/integer.cpp
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -28,19 +28,22 @@ namespace arm_compute
{
namespace cpu
{
-void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_u8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<uint8_t>(src0, src1, dst, policy, window);
}
-void add_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<int16_t>(src0, src1, dst, policy, window);
}
-void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s32_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_neon<int32_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
index 69cca956c8..8195d229d9 100644
--- a/src/cpu/kernels/add/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
@@ -23,15 +23,17 @@
*/
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
index dfdf8fe85b..7e23096239 100644
--- a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
@@ -23,15 +23,17 @@
*/
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
index e76e408d6e..ac2de0557a 100644
--- a/src/cpu/kernels/add/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
@@ -25,14 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
{
namespace cpu
{
-void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -57,7 +59,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -65,7 +67,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -74,48 +76,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+ const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
+ const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
+ const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
+ const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+ const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#endif //__aarch64__
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -127,48 +131,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const int16x8_t a = vld1q_s16(input1_ptr + x);
+ const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+ const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+ const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+ const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
+ const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
+
+ int32x4_t rf_0{};
+ int32x4_t rf_1{};
#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+ rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+ rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
#endif //__aarch64__
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+ vst1q_s16(output_ptr + x, pa);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
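QSYMM16 is symmetrically quantized, so the kernel above needs no zero points: each input is dequantized with its own scale, the sum is requantized with 1/output_scale, and the result is saturated to int16 (quantize_qsymm16 performs that clamping in the library). A scalar model under those assumptions:

#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t add_qsymm16_scalar(int16_t a, int16_t b, float scale_a, float scale_b, float out_scale)
{
    const float afs = float(a) * scale_a;                     // dequantize input 1
    const float bfs = float(b) * scale_b;                     // dequantize input 2
    const long  q   = std::lround((afs + bfs) / out_scale);   // requantize (round-to-nearest variant)
    return static_cast<int16_t>(std::clamp(q, -32768L, 32767L));
}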
diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp
index 581f3abded..01dfe6c44b 100644
--- a/src/cpu/kernels/add/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/add/generic/sve/fp16.cpp
@@ -31,10 +31,11 @@ namespace arm_compute
{
namespace cpu
{
-void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp16_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<float16_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp
index b37799113a..56771a5411 100644
--- a/src/cpu/kernels/add/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/add/generic/sve/fp32.cpp
@@ -24,15 +24,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/kernels/add/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp32_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<float>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp
index e8606436fd..ca850fcef4 100644
--- a/src/cpu/kernels/add/generic/sve/impl.cpp
+++ b/src/cpu/kernels/add/generic/sve/impl.cpp
@@ -23,17 +23,21 @@
*/
#include "src/cpu/kernels/add/generic/sve/impl.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_same_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
const auto all_true_pg = wrapper::svptrue<ScalarType>();
const auto window_start_x = static_cast<int>(window.x().start());
@@ -53,7 +57,7 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
Iterator output(dst, window);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -68,28 +72,30 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
+ auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v)
+ : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
+ svst1(pg, output_ptr + x, res);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -101,35 +107,41 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto val1 = svld1(pg, input1_ptr + x);
- const auto val2 = svld1(pg, input2_ptr + x);
- const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto val1 = svld1(pg, input1_ptr + x);
+ const auto val2 = svld1(pg, input2_ptr + x);
+ const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
+ svst1(pg, output_ptr + x, res);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
-template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<uint8_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int16_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int32_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float16_t>(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
} // namespace cpu
} // namespace arm_compute
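The SVE kernels above replace the NEON pattern of a vector loop plus a scalar left-over loop with a single predicated loop: svwhilelt produces a predicate covering only the remaining elements, so the final iteration simply runs with partially active lanes. A float-only sketch of that loop shape, assuming an SVE-enabled toolchain; the library reaches the same intrinsics through its wrapper layer:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

void add_f32_sve_sketch(const float *a, const float *b, float *out, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);   // active lanes for this iteration
    do
    {
        const svfloat32_t va = svld1(pg, a + x);
        const svfloat32_t vb = svld1(pg, b + x);
        svst1(pg, out + x, svadd_z(pg, va, vb));

        x += static_cast<int>(svcntw());   // advance by the vector length in 32-bit lanes
        pg = svwhilelt_b32(x, len);
    } while (svptest_any(svptrue_b32(), pg));
}
#endif // __ARM_FEATURE_SVE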
diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h
index 0136f14246..6a95d66826 100644
--- a/src/cpu/kernels/add/generic/sve/impl.h
+++ b/src/cpu/kernels/add/generic/sve/impl.h
@@ -33,7 +33,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+void add_same_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp
index 3642dccd7b..4d17f2adbd 100644
--- a/src/cpu/kernels/add/generic/sve/integer.cpp
+++ b/src/cpu/kernels/add/generic/sve/integer.cpp
@@ -24,25 +24,29 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/kernels/add/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_u8_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<uint8_t>(src0, src1, dst, policy, window);
}
-void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s16_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<int16_t>(src0, src1, dst, policy, window);
}
-void add_s32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s32_sve(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
return add_same_sve<int32_t>(src0, src1, dst, policy, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
index 1dec214aa0..40add9d51b 100644
--- a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
@@ -26,15 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -58,7 +61,7 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
const auto voffseto = svdup_n_f32(oq_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -78,48 +81,89 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
+ const auto bf_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(
+ pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))),
+ voffset2)),
+ vscale2);
- do
- {
- const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
-
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ do
+ {
+ const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
+
+ const auto af_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+
+ const auto rf_0 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+
+ const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+ svst1_u8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -136,45 +180,82 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const auto voffset1 = svdup_n_s32(iq1_info.offset);
const auto voffset2 = svdup_n_s32(iq2_info.offset);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = svld1_u8(pg, input1_ptr + x);
- const auto b = svld1_u8(pg, input2_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
-
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto a = svld1_u8(pg, input1_ptr + x);
+ const auto b = svld1_u8(pg, input2_ptr + x);
+ const auto af_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+ vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+ vscale1);
+
+ const auto bf_0 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg,
+ svcvt_f32_s32_z(pg,
+ svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)),
+ vscale2);
+
+ const auto rf_0 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+ const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+ const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+
+ svst1_u8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
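For orientation while reading these QASYMM8 hunks: the kernel dequantises each input with its own scale and zero-point offset, adds in float, then requantises into the output's quantisation before saturating back to 8 bits. Below is a minimal scalar sketch of that arithmetic, assuming uniform per-tensor quantisation; the helper name, signature and rounding are illustrative only and are not part of the library.

#include <algorithm>
#include <cstdint>

// Scalar model of the dequantise-add-requantise math that add_qasymm8_sve2
// performs four SVE vectors at a time above (rounding details simplified).
inline uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b,
                                  float scale1, int32_t offset1, // input 1 quantisation
                                  float scale2, int32_t offset2, // input 2 quantisation
                                  float oscale, int32_t ooffset) // output quantisation
{
    const float af = (static_cast<int32_t>(a) - offset1) * scale1; // dequantise a
    const float bf = (static_cast<int32_t>(b) - offset2) * scale2; // dequantise b
    const float rf = (af + bf) * (1.f / oscale) + ooffset;         // requantise the sum
    return static_cast<uint8_t>(std::clamp(static_cast<int32_t>(rf), 0, 255)); // saturate to u8
}

The svqxtnb/svqxtnt pairs in the vector code perform the same saturation while narrowing the 32-bit results back down to 8 bits.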
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
index dae8899753..2e585115e1 100644
--- a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
@@ -26,15 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -57,7 +60,7 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *
const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
const auto voffseto = svdup_n_f32(oq_info.offset);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -78,46 +81,63 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
-
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ const auto bf_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+ vscale2);
+
+ do
+ {
+ const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
+ const auto af_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+
+ const auto rf_0 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+ const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+ svst1_s8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -134,46 +154,59 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *
const auto voffset1 = svdup_n_s32(iq1_info.offset);
const auto voffset2 = svdup_n_s32(iq2_info.offset);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = svld1_s8(pg, input1_ptr + x);
- const auto b = svld1_s8(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ const auto a = svld1_s8(pg, input1_ptr + x);
+ const auto b = svld1_s8(pg, input2_ptr + x);
+
+ const auto af_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+ const auto af_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+ const auto af_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+
+ const auto bf_0 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
+ const auto bf_1 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
+ const auto bf_2 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
+ const auto bf_3 = svmul_f32_z(
+ pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
+
+ const auto rf_0 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_2 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+ const auto rf_3 =
+ svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+ const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+ const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+ svst1_s8(pg, output_ptr + x, res);
+
+ x += svcntb();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
index 8c48ded942..17a42c2138 100644
--- a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
@@ -26,15 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/SVEMath.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include <arm_sve.h>
namespace arm_compute
{
namespace cpu
{
-void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_sve2(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -59,7 +62,7 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
const auto all_true_pg = svptrue_b16();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -74,39 +77,40 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
+ const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
+ const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
- do
- {
- const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+ do
+ {
+ const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
+ const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+ const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- svst1_s16(pg, output_ptr + x, res);
+ svst1_s16(pg, output_ptr + x, res);
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -118,37 +122,38 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- auto a = svld1_s16(pg, input1_ptr + x);
- auto b = svld1_s16(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- svst1_s16(pg, output_ptr + x, res);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ auto a = svld1_s16(pg, input1_ptr + x);
+ auto b = svld1_s16(pg, input2_ptr + x);
+
+ const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+ const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+
+ const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
+ const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
+
+ const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+ const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+
+ const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+ svst1_s16(pg, output_ptr + x, res);
+
+ x += svcnth();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
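All of these SVE2 kernels share the same predicated loop shape: svwhilelt builds a predicate covering the remaining elements, the loop strides by the vector length, and svptest_any ends the loop once no lane is active, so no scalar tail loop is needed. The following self-contained sketch shows that pattern in isolation, assuming an SVE-enabled compiler and target; the copy function itself is illustrative and not library code.

#include <arm_sve.h>
#include <cstdint>

void copy_u8_sve(const uint8_t *src, uint8_t *dst, int64_t n)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b8(x, n); // active lanes are those with x + lane < n
    do
    {
        const svuint8_t v = svld1_u8(pg, src + x); // inactive lanes load as zero
        svst1_u8(pg, dst + x, v);                  // inactive lanes are not stored
        x += svcntb();                             // advance by the vector length in bytes
        pg = svwhilelt_b8(x, n);                   // recompute the predicate for the tail
    } while (svptest_any(svptrue_b8(), pg));       // stop once no lane is active
}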
diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h
index 7cdb70fd9e..1040c39a41 100644
--- a/src/cpu/kernels/add/list.h
+++ b/src/cpu/kernels/add/list.h
@@ -31,8 +31,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+#define DECLARE_ADD_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+ const Window &window)
DECLARE_ADD_KERNEL(add_qasymm8_neon);
DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
@@ -55,4 +56,4 @@ DECLARE_ADD_KERNEL(add_qsymm16_sve2);
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_KERNELS_ADD_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_KERNELS_ADD_LIST_H
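The reformatted DECLARE_ADD_KERNEL macro above changes only line breaks; each use still expands to an ordinary function declaration. For example, DECLARE_ADD_KERNEL(add_qasymm8_neon); expands to:

void add_qasymm8_neon(
    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);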
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
index d8e5f694a8..b4b81aa78b 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/CpuTypes.h"
#include <cstddef>
@@ -38,16 +39,20 @@ namespace
{
using arm_compute::float16_t;
-void a64_add_bn_clamp_direct_fp16_2x32(
- float16_t *out, size_t out_stride,
- float16_t *out_direct, size_t out_direct_stride,
- const float16_t *in0, size_t in0_stride,
- const float16_t *in1, size_t in1_stride,
- const float16_t *bn_mul,
- const float16_t *bn_add,
- const float16_t minval,
- const float16_t maxval,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out,
+ size_t out_stride,
+ float16_t *out_direct,
+ size_t out_direct_stride,
+ const float16_t *in0,
+ size_t in0_stride,
+ const float16_t *in1,
+ size_t in1_stride,
+ const float16_t *bn_mul,
+ const float16_t *bn_add,
+ const float16_t minval,
+ const float16_t maxval,
+ size_t width,
+ size_t height)
{
struct KernelArgs
{
@@ -858,9 +863,14 @@ void a64_add_bn_clamp_direct_fp16_2x32(
"subs x20, x20, #0x2\n"
"bgt 8b\n"
"58:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
} // namespace
@@ -869,8 +879,15 @@ namespace arm_compute
{
namespace cpu
{
-void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_fp16_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -882,16 +899,16 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I
float16_t minval = std::numeric_limits<half>::lowest();
float16_t maxval = std::numeric_limits<half>::max();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = static_cast<float16_t>(0.f);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = static_cast<float16_t>(0.f);
maxval = static_cast<float16_t>(act_info.a());
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = static_cast<float16_t>(act_info.b());
maxval = static_cast<float16_t>(act_info.a());
@@ -909,42 +926,37 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp16_2x32(
- reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
- reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float16_t *>(bn_mul->buffer()),
- reinterpret_cast<float16_t *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
+ reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
+ reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+ reinterpret_cast<float16_t *>(bn_mul->buffer()),
+ reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+ width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp16_2x32(
- reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float16_t *>(bn_mul->buffer()),
- reinterpret_cast<float16_t *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr,
+ out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()),
+ in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+ reinterpret_cast<float16_t *>(bn_mul->buffer()),
+ reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+ width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
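For readers unfamiliar with these fused kernels: a64_add_bn_clamp_direct_* adds the two inputs, optionally writes that sum out directly, then applies a per-column multiply/add (the bn_mul/bn_add vectors) and clamps to [minval, maxval]. A rough scalar model follows, with strides, tiling and the quantised variants omitted and a contiguous row-major layout assumed; this reference function is a sketch, not part of the library.

#include <algorithm>
#include <cstddef>

template <typename T>
void add_bn_clamp_ref(T *out, T *out_direct, const T *in0, const T *in1,
                      const T *bn_mul, const T *bn_add, T minval, T maxval,
                      size_t width, size_t height)
{
    for (size_t y = 0; y < height; ++y)
    {
        for (size_t x = 0; x < width; ++x)
        {
            const T sum = in0[y * width + x] + in1[y * width + x]; // elementwise add
            if (out_direct != nullptr)
            {
                out_direct[y * width + x] = sum; // optional pre-batch-norm output
            }
            const T bn = sum * bn_mul[x] + bn_add[x];            // per-column scale and bias
            out[y * width + x] = std::clamp(bn, minval, maxval); // activation clamp
        }
    }
}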
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
index b0c487ec56..f0444b6acd 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
@@ -35,16 +35,20 @@
#ifdef __aarch64__
namespace
{
-void a64_add_bn_clamp_direct_fp32_2x16(
- float *out, size_t out_stride,
- float *out_direct, size_t out_direct_stride,
- const float *in0, size_t in0_stride,
- const float *in1, size_t in1_stride,
- const float *bn_mul,
- const float *bn_add,
- const float minval,
- const float maxval,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_fp32_2x16(float *out,
+ size_t out_stride,
+ float *out_direct,
+ size_t out_direct_stride,
+ const float *in0,
+ size_t in0_stride,
+ const float *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const float minval,
+ const float maxval,
+ size_t width,
+ size_t height)
{
struct KernelArgs
{
@@ -631,18 +635,30 @@ void a64_add_bn_clamp_direct_fp32_2x16(
"subs x20, x20, #0x2\n"
"bgt 8b\n"
"34:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
-}
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
+} // namespace
namespace arm_compute
{
namespace cpu
{
-void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_fp32_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -654,16 +670,16 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I
float minval = std::numeric_limits<float>::lowest();
float maxval = std::numeric_limits<float>::max();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = 0.f;
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = 0.f;
maxval = act_info.a();
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = act_info.b();
maxval = act_info.a();
@@ -681,42 +697,34 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp32_2x16(
- reinterpret_cast<float *>(out_it.ptr()), out_stride,
- reinterpret_cast<float *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float *>(bn_mul->buffer()),
- reinterpret_cast<float *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp32_2x16(
+ reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()),
+ out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+ reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_fp32_2x16(
- reinterpret_cast<float *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<float *>(in2_it.ptr()), in1_stride,
- reinterpret_cast<float *>(bn_mul->buffer()),
- reinterpret_cast<float *>(bn_add->buffer()),
- minval,
- maxval,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_fp32_2x16(
+ reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()),
+ in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+ reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
index f7448a6717..035805c944 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
@@ -36,22 +36,30 @@
#ifdef __aarch64__
namespace
{
-void a64_add_bn_clamp_direct_u8_fp32_2x16(
- uint8_t *out, size_t out_stride,
- uint8_t *out_direct, size_t out_direct_stride,
- const uint8_t *in0, size_t in0_stride,
- const uint8_t *in1, size_t in1_stride,
- const float *bn_mul,
- const float *bn_add,
- const uint8_t minval,
- const uint8_t maxval,
- int32_t out_zeropt, float out_scale,
- int32_t out_direct_zeropt, float out_direct_scale,
- int32_t in0_zeropt, float in0_scale,
- int32_t in1_zeropt, float in1_scale,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out,
+ size_t out_stride,
+ uint8_t *out_direct,
+ size_t out_direct_stride,
+ const uint8_t *in0,
+ size_t in0_stride,
+ const uint8_t *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const uint8_t minval,
+ const uint8_t maxval,
+ int32_t out_zeropt,
+ float out_scale,
+ int32_t out_direct_zeropt,
+ float out_direct_scale,
+ int32_t in0_zeropt,
+ float in0_scale,
+ int32_t in1_zeropt,
+ float in1_scale,
+ size_t width,
+ size_t height)
{
- float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
+ float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
struct KernelArgs
{
const float *scales;
@@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_u8_fp32_2x16(
"subs x23, x23, #0x2\n"
"bgt 6b\n"
"32:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+ [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+ [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+ [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+ [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
} // namespace
@@ -720,8 +738,15 @@ namespace arm_compute
{
namespace cpu
{
-void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_u8_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -739,24 +764,25 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe
uint8_t maxval = std::numeric_limits<uint8_t>::max();
const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = quantize_qasymm8(0.f, final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = quantize_qasymm8(0.f, final_output_qinfo);
maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = quantize_qasymm8(act_info.b(), final_output_qinfo);
maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
}
- const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
- const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
- const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+ const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+ const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+ const UniformQuantizationInfo add_output_qinfo =
+ (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
const int32_t in1_offset = in1_qinfo.offset;
const int32_t in2_offset = in2_qinfo.offset;
@@ -783,50 +809,35 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_u8_fp32_2x16(
- reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
- reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_u8_fp32_2x16(
+ reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
+ reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
+ reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_u8_fp32_2x16(
- reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_u8_fp32_2x16(
+ reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
index 1ae2cb76a9..e1a45b467b 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
@@ -36,22 +36,30 @@
#ifdef __aarch64__
namespace
{
-void a64_add_bn_clamp_direct_s8_fp32_2x16(
- int8_t *out, size_t out_stride,
- int8_t *out_direct, size_t out_direct_stride,
- const int8_t *in0, size_t in0_stride,
- const int8_t *in1, size_t in1_stride,
- const float *bn_mul,
- const float *bn_add,
- const int8_t minval,
- const int8_t maxval,
- int32_t out_zeropt, float out_scale,
- int32_t out_direct_zeropt, float out_direct_scale,
- int32_t in0_zeropt, float in0_scale,
- int32_t in1_zeropt, float in1_scale,
- size_t width, size_t height)
+void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out,
+ size_t out_stride,
+ int8_t *out_direct,
+ size_t out_direct_stride,
+ const int8_t *in0,
+ size_t in0_stride,
+ const int8_t *in1,
+ size_t in1_stride,
+ const float *bn_mul,
+ const float *bn_add,
+ const int8_t minval,
+ const int8_t maxval,
+ int32_t out_zeropt,
+ float out_scale,
+ int32_t out_direct_zeropt,
+ float out_direct_scale,
+ int32_t in0_zeropt,
+ float in0_scale,
+ int32_t in1_zeropt,
+ float in1_scale,
+ size_t width,
+ size_t height)
{
- float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
+ float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
struct KernelArgs
{
const float *scales;
@@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_s8_fp32_2x16(
"subs x23, x23, #0x2\n"
"bgt 6b\n"
"32:" // odd columns skip
- : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
- : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+ : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+ [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+ : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+ [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+ [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+ [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+ [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+ [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+ [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+ [out_stride] "r"(out_stride)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+ "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
}
} // namespace
@@ -720,8 +738,15 @@ namespace arm_compute
{
namespace cpu
{
-void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_s8_neon(const ITensor *input1,
+ const ITensor *input2,
+ const ITensor *bn_mul,
+ const ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -739,24 +764,25 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe
int8_t maxval = std::numeric_limits<int8_t>::max();
const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
{
minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
{
minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
}
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
{
minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
}
- const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
- const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
- const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+ const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+ const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+ const UniformQuantizationInfo add_output_qinfo =
+ (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
const int32_t in1_offset = in1_qinfo.offset;
const int32_t in2_offset = in2_qinfo.offset;
@@ -783,50 +809,35 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe
const size_t width = window.num_iterations(0);
const size_t height = window.num_iterations(1);
- if(add_output != nullptr)
+ if (add_output != nullptr)
{
Iterator add_out_it(add_output, window);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_s8_fp32_2x16(
- reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
- reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride,
- reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, add_out_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_s8_fp32_2x16(
+ reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()),
+ out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
+ reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval,
+ out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset,
+ in2_scale, width, height);
+ },
+ in1_it, in2_it, add_out_it, out_it);
}
else
{
execute_window_loop(
- win, [&](const Coordinates &)
- {
- a64_add_bn_clamp_direct_s8_fp32_2x16(
- reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
- nullptr, out_direct_stride,
- reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
- reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
- bn_mul_buffer,
- bn_add_buffer,
- minval,
- maxval,
- out_offset, out_scale,
- out_direct_offset, out_direct_scale,
- in1_offset, in1_scale,
- in2_offset, in2_scale,
- width, height);
- },
- in1_it, in2_it, out_it);
+ win,
+ [&](const Coordinates &)
+ {
+ a64_add_bn_clamp_direct_s8_fp32_2x16(
+ reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+ reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()),
+ in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+ out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+ },
+ in1_it, in2_it, out_it);
}
}
} // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h
index a7c22c06d8..568003a916 100644
--- a/src/cpu/kernels/addmuladd/list.h
+++ b/src/cpu/kernels/addmuladd/list.h
@@ -32,9 +32,10 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \
+#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \
void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \
- ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+ ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \
+ const ActivationLayerInfo &act_info, const Window &window)
DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon);
DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon);
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index 10bf8e4ff7..6e8f32ef47 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/NEON/INEKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
@@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel
public:
/** Constructor
*/
- CpuGemmAssemblyWrapperKernel()
- : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
+ CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
{
}
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
const char *name() const override
@@ -110,7 +110,7 @@ public:
INEKernel::configure(win);
- if(!kernel_name_tag.empty())
+ if (!kernel_name_tag.empty())
{
_name += "/" + kernel_name_tag;
}
@@ -132,7 +132,7 @@ public:
private:
arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
+ std::string _name;
};
} // namespace kernel
} // namespace cpu
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 4c127b4ec3..9a913c5c58 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -23,13 +23,12 @@
*/
#pragma once
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
#include <cstring>
#include <memory>
#include <vector>
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
namespace arm_gemm
{
enum class GemmMethod
@@ -111,8 +110,7 @@ struct GemmConfig
unsigned int outer_block_size = 0;
WeightFormat weight_format = WeightFormat::ANY;
- GemmConfig(GemmMethod method)
- : method(method)
+ GemmConfig(GemmMethod method) : method(method)
{
}
GemmConfig()
@@ -133,8 +131,7 @@ struct Activation
float param1;
float param2;
- Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
- : type(type), param1(p1), param2(p2)
+ Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2)
{
}
};
@@ -156,12 +153,32 @@ public:
bool _fast_mode;
const GemmConfig *_cfg;
- GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
- unsigned int K, unsigned int Ksections, unsigned int nbatches,
- unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
- bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads),
- _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg)
+ GemmArgs(const CPUInfo *ci,
+ unsigned int M,
+ unsigned int N,
+ unsigned int K,
+ unsigned int Ksections,
+ unsigned int nbatches,
+ unsigned int nmulti,
+ bool indirect_input,
+ Activation act,
+ const int maxthreads,
+ bool fixed_format = false,
+ bool fast_mode = false,
+ const GemmConfig *cfg = nullptr)
+ : _ci(ci),
+ _Msize(M),
+ _Nsize(N),
+ _Ksize(K),
+ _Ksections(Ksections),
+ _nbatches(nbatches),
+ _nmulti(nmulti),
+ _indirect_input(indirect_input),
+ _act(act),
+ _maxthreads(maxthreads),
+ _fixed_format(fixed_format),
+ _fast_mode(fast_mode),
+ _cfg(cfg)
{
}
};
@@ -187,23 +204,51 @@ public:
Requantize32() = default;
// Constructor for per-tensor quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
- per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
+ int32_t requant_shift,
+ int32_t requant_mul,
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(false),
+ per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+ per_layer_right_shift(std::min<int32_t>(requant_shift, 0)),
+ per_layer_mul(requant_mul),
+ minval(minv),
+ maxval(maxv)
{
}
// Constructor for per-channel quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
const int32_t *requant_left_shifts,
const int32_t *requant_right_shifts,
const int32_t *requant_muls,
- int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts),
- per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv)
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(true),
+ per_channel_left_shifts(requant_left_shifts),
+ per_channel_right_shifts(requant_right_shifts),
+ per_channel_muls(requant_muls),
+ minval(minv),
+ maxval(maxv)
{
}
};
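
The reflowed Requantize32 constructors above split a single requant_shift into a non-negative left shift and a non-positive right shift via std::max/std::min. The sketch below mirrors that split with a simplified stand-in struct (DemoRequantize and its field names are illustrative only, and the numeric values are made up); it is a hedged sketch of the pattern, not arm_gemm's type.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    struct DemoRequantize
    {
        int32_t a_offset        = 0;
        int32_t b_offset        = 0;
        int32_t c_offset        = 0;
        int32_t per_layer_left  = 0; // non-negative part of the requant shift
        int32_t per_layer_right = 0; // non-positive part of the requant shift
        int32_t per_layer_mul   = 0;
        int32_t minval          = 0;
        int32_t maxval          = 0;

        DemoRequantize(int32_t a_off, int32_t b_off, int32_t c_off,
                       int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
            : a_offset(a_off),
              b_offset(b_off),
              c_offset(c_off),
              per_layer_left(std::max<int32_t>(requant_shift, 0)),
              per_layer_right(std::min<int32_t>(requant_shift, 0)),
              per_layer_mul(requant_mul),
              minval(minv),
              maxval(maxv)
        {
        }
    };

    int main()
    {
        // A shift of -8 lands entirely in the "right shift" field, mirroring the
        // std::max/std::min split used by the per-tensor constructor above.
        DemoRequantize q(-128, 0, 0, /*requant_shift=*/-8, /*requant_mul=*/1073741824, -128, 127);
        std::printf("left=%d right=%d\n", static_cast<int>(q.per_layer_left), static_cast<int>(q.per_layer_right));
        return 0;
    }
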
diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
index 718fcd1fb4..0672e899b6 100644
--- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/Window.h"
#include "ndrange.hpp"
-
#include <cassert>
/* This file contains mapping between integral types used in arm_compute and arm_gemm
@@ -38,8 +37,7 @@
namespace arm_gemm
{
//we want to unify the maximum number of dimensions used between arm_gemm and arm compute library
-constexpr std::size_t ndrange_max =
- arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions;
using ndrange_t = NDRange<ndrange_max>;
using ndcoord_t = NDCoordinate<ndrange_max>;
@@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr)
{
arm_compute::Window win;
- for(unsigned int i = 0; i != ndrange_max; ++i)
+ for (unsigned int i = 0; i != ndrange_max; ++i)
{
//populate the window with the dimensions of the NDRange
win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
@@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc)
{
arm_compute::Window win;
- for(unsigned int i = 0; i != ndrange_max; ++i)
+ for (unsigned int i = 0; i != ndrange_max; ++i)
{
const auto start = ndc.get_position(i);
const auto size = ndc.get_size(i);
@@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc)
*/
inline ndrange_t to_ndrange(const arm_compute::Window &win)
{
- return
- {
- static_cast<unsigned int>(win[0].end() - win[0].start()),
- static_cast<unsigned int>(win[1].end() - win[1].start()),
- static_cast<unsigned int>(win[2].end() - win[2].start()),
- static_cast<unsigned int>(win[3].end() - win[3].start()),
- static_cast<unsigned int>(win[4].end() - win[4].start()),
- static_cast<unsigned int>(win[5].end() - win[5].start())
- };
+ return {static_cast<unsigned int>(win[0].end() - win[0].start()),
+ static_cast<unsigned int>(win[1].end() - win[1].start()),
+ static_cast<unsigned int>(win[2].end() - win[2].start()),
+ static_cast<unsigned int>(win[3].end() - win[3].start()),
+ static_cast<unsigned int>(win[4].end() - win[4].start()),
+ static_cast<unsigned int>(win[5].end() - win[5].start())};
}
/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
@@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win)
*/
inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
{
- return
- {
- { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
- { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
- { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
- { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
- { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
- { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
- };
+ return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())},
+ {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())},
+ {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())},
+ {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())},
+ {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())},
+ {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}};
}
} //namespace arm_gemm
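
The to_ndrange/to_ndcoord helpers above map each Window dimension's half-open range [start, end) to an extent (end - start), with to_ndcoord additionally keeping the start as a position. The following is a hedged, self-contained sketch of that mapping with simplified stand-in types (DemoDim, DemoRange, DemoCoord are illustrative, not the real Window/NDRange/NDCoordinate).

    #include <array>
    #include <cstdio>

    struct DemoDim   { int start; int end; };               // like Window::Dimension: the range [start, end)
    struct DemoRange { std::array<unsigned int, 3> size; }; // like an NDRange: extents only
    struct DemoCoord { std::array<unsigned int, 3> pos; std::array<unsigned int, 3> size; }; // like an NDCoordinate

    DemoRange to_demo_range(const std::array<DemoDim, 3> &win)
    {
        DemoRange r{};
        for (unsigned int i = 0; i != 3; ++i)
        {
            r.size[i] = static_cast<unsigned int>(win[i].end - win[i].start); // extent = end - start
        }
        return r;
    }

    DemoCoord to_demo_coord(const std::array<DemoDim, 3> &win)
    {
        DemoCoord c{};
        for (unsigned int i = 0; i != 3; ++i)
        {
            c.pos[i]  = static_cast<unsigned int>(win[i].start);              // keep the start as the position
            c.size[i] = static_cast<unsigned int>(win[i].end - win[i].start); // same extent as the range
        }
        return c;
    }

    int main()
    {
        const std::array<DemoDim, 3> win{{{4, 12}, {0, 3}, {1, 2}}};
        const DemoRange r = to_demo_range(win);
        const DemoCoord c = to_demo_coord(win);
        std::printf("range: %u %u %u\n", r.size[0], r.size[1], r.size[2]); // 8 3 1
        std::printf("coord0: pos=%u size=%u\n", c.pos[0], c.size[0]);      // pos=4 size=8
        return 0;
    }
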
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 834cd1061e..6fe9f13f02 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -25,7 +25,6 @@
#include "convolution_parameters.hpp"
#include "ndrange.hpp"
-
#include <cstddef>
namespace arm_gemm
@@ -51,10 +50,19 @@ public:
* appropriately typed pointers. If B is pretransposed (see below) then
* the settings for B here are ignored.
*/
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+ virtual void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) = 0;
/** @returns an ndrange containing ranges of the compute space which can be
* broken up and parallelised over
@@ -73,7 +81,7 @@ public:
* This has an empty default implementation, as GEMMs which don't care
* about thread count can safely ignore this.
*/
- virtual void set_nthreads(int) {};
+ virtual void set_nthreads(int){};
/* Whether this GEMM can be dynamically scheduled or not. */
virtual bool supports_dynamic_scheduling() const
@@ -95,7 +103,7 @@ public:
return 0;
}
/* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) {};
+ virtual void set_working_space(void *){};
/*** "Pretransposed" interface (optional) ***/
/* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */
@@ -122,7 +130,8 @@ public:
/* The "real" version of this depends on the templated operand type (see below). */
virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
/* Threaded version with window start/end parameters */
- virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+ virtual void
+ pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
virtual void set_pretransposed_B_data(void *)
@@ -186,10 +195,19 @@ protected:
public:
/* Pass in the pointers to the arrays to be operated on and their
* strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+ virtual void set_arrays(const To *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const To *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ Tr *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const Tr *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride)
{
_Aptr = A;
_lda = lda;
@@ -207,25 +225,33 @@ public:
}
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+ void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) override
{
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+ B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
static_cast<const Tr *>(bias), bias_multi_stride);
}
/*** "Pretransposed" interface ***/
/* Compute col sums over all columns */
- virtual void requantize_bias(void *, const To *, const int, const int) {};
+ virtual void requantize_bias(void *, const To *, const int, const int){};
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
/* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
+ virtual void pretranspose_B_array(void *, const To *, const int, const int){};
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
@@ -237,12 +263,14 @@ public:
* The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
* just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
* legal values for start and end are 0 and 1 respectively. */
- virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+ virtual void
+ pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
{
pretranspose_B_array(out, in, row_stride, multi_stride);
};
- void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+ void pretranspose_B_array_part_generic(
+ void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
{
pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
}
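
The set_arrays_generic and pretranspose_B_array_part_generic overloads above exist so callers holding only void * buffers can reach the typed implementation through the abstract interface. Below is a hedged, trimmed-down sketch of that type-erasure-and-forward pattern (DemoGemmBase/DemoGemm and their members are illustrative names, not GemmCommon itself).

    #include <cstdio>

    class DemoGemmBase
    {
    public:
        virtual ~DemoGemmBase() = default;
        // Untyped entry point: callers only hold void pointers.
        virtual void set_arrays_generic(const void *A, void *C) = 0;
    };

    template <typename To, typename Tr>
    class DemoGemm : public DemoGemmBase
    {
    public:
        // Typed version: stores appropriately typed pointers.
        void set_arrays(const To *A, Tr *C)
        {
            _Aptr = A;
            _Cptr = C;
        }

        // The void * overload just casts its arguments and forwards.
        void set_arrays_generic(const void *A, void *C) override
        {
            set_arrays(static_cast<const To *>(A), static_cast<Tr *>(C));
        }

        const To *_Aptr = nullptr;
        Tr       *_Cptr = nullptr;
    };

    int main()
    {
        float a[4] = {1.f, 2.f, 3.f, 4.f};
        float c[4] = {};
        DemoGemm<float, float> gemm;

        DemoGemmBase &erased = gemm;
        erased.set_arrays_generic(a, c); // dispatches to the typed set_arrays
        std::printf("A[0]=%f\n", static_cast<double>(gemm._Aptr[0]));
        return 0;
    }
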
diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp
index 1c8261aef7..baccdc0d88 100644
--- a/src/cpu/kernels/assembly/ndrange.hpp
+++ b/src/cpu/kernels/assembly/ndrange.hpp
@@ -45,8 +45,7 @@ private:
unsigned int m_end = 0;
public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
- : m_parent(p), m_pos(s), m_end(e)
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e)
{
}
@@ -59,12 +58,12 @@ private:
{
unsigned int r = m_pos;
- if(d < (D - 1))
+ if (d < (D - 1))
{
r %= m_parent.m_totalsizes[d];
}
- if(d > 0)
+ if (d > 0)
{
r /= m_parent.m_totalsizes[d - 1];
}
@@ -98,9 +97,9 @@ private:
{
unsigned int t = 1;
- for(unsigned int i = 0; i < D; i++)
+ for (unsigned int i = 0; i < D; i++)
{
- if(m_sizes[i] == 0)
+ if (m_sizes[i] == 0)
{
m_sizes[i] = 1;
}
@@ -116,14 +115,12 @@ public:
NDRange(const NDRange &rhs) = default;
template <typename... T>
- NDRange(T... ts)
- : m_sizes{ ts... }
+ NDRange(T... ts) : m_sizes{ts...}
{
set_totalsizes();
}
- NDRange(const std::array<unsigned int, D> &n)
- : m_sizes(n)
+ NDRange(const std::array<unsigned int, D> &n) : m_sizes(n)
{
set_totalsizes();
}
@@ -163,7 +160,7 @@ public:
std::array<int_t, N> sizes{};
std::size_t i = 0;
- for(auto &p : list)
+ for (auto &p : list)
{
m_positions[i] = p.first;
sizes[i++] = p.second;
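
NDRangeIterator::dim() in the hunk above recovers a coordinate from the flat position by taking it modulo the cumulative size of dimensions 0..d and dividing by the cumulative size of dimensions 0..d-1. A hedged standalone sketch of that decomposition, fixed to three dimensions with made-up sizes:

    #include <array>
    #include <cstdio>

    int main()
    {
        const std::array<unsigned int, 3> sizes{4, 3, 2}; // extents of each dimension
        std::array<unsigned int, 3>       totals{};       // totals[d] = product of sizes[0..d]

        unsigned int t = 1;
        for (unsigned int d = 0; d < 3; ++d)
        {
            t *= sizes[d];
            totals[d] = t;
        }

        const unsigned int pos = 17; // flat position in [0, 4*3*2)
        for (unsigned int d = 0; d < 3; ++d)
        {
            unsigned int r = pos;
            if (d < 2)
            {
                r %= totals[d];     // drop the contribution of higher dimensions
            }
            if (d > 0)
            {
                r /= totals[d - 1]; // strip the lower dimensions
            }
            std::printf("dim %u -> %u\n", d, r); // prints 1 1 1, since 1 + 1*4 + 1*12 == 17
        }
        return 0;
    }
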
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
index 5661479059..dbdec5fb50 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
@@ -29,7 +29,11 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_fp16_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window);
}
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
index 34ff9224d5..0224b3406a 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
@@ -26,7 +26,11 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_fp32_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window);
}
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
index b3ffd0a676..5a2939b587 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
@@ -29,7 +29,11 @@ namespace arm_compute
{
namespace cpu
{
-void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void bounding_box_transform_qsymm16(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
@@ -41,7 +45,8 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c
const auto scale_before = bbinfo.scale();
const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f);
- auto pred_ptr = reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+ auto pred_ptr =
+ reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
const auto boxes_qinfo = boxes->info()->quantization_info().uniform();
@@ -49,41 +54,49 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c
const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform();
Iterator box_it(boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
- const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
- const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
- const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
- const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
- const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
- const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
- const float ctr_x = (b0 / scale_before) + 0.5f * width;
- const float ctr_y = (b1 / scale_before) + 0.5f * height;
- for(size_t j = 0; j < num_classes; ++j)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
- const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
- float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
- float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
- // Clip dw and dh
- dw = std::min(dw, bbinfo.bbox_xform_clip());
- dh = std::min(dh, bbinfo.bbox_xform_clip());
- // Determine the predictions
- const float pred_ctr_x = dx * width + ctr_x;
- const float pred_ctr_y = dy * height + ctr_y;
- const float pred_w = std::exp(dw) * width;
- const float pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
- pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
- pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
- }
- },
- box_it);
+ const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
+ const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
+ const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
+ const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
+ const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
+ const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
+ const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
+ const float ctr_x = (b0 / scale_before) + 0.5f * width;
+ const float ctr_y = (b1 / scale_before) + 0.5f * height;
+ for (size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
+ const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
+ float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
+ float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
+ // Clip dw and dh
+ dw = std::min(dw, bbinfo.bbox_xform_clip());
+ dh = std::min(dh, bbinfo.bbox_xform_clip());
+ // Determine the predictions
+ const float pred_ctr_x = dx * width + ctr_x;
+ const float pred_ctr_y = dy * height + ctr_y;
+ const float pred_w = std::exp(dw) * width;
+ const float pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 1] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
+ pred_ptr[delta_id + 2] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f),
+ pred_qinfo);
+ pred_ptr[delta_id + 3] = quantize_qasymm16(
+ scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f),
+ pred_qinfo);
+ }
+ },
+ box_it);
}
} // namespace cpu
} // namespace arm_compute
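
The loop bodies above apply the standard box-delta decoding: centre and size are recovered from the corner box, dw/dh are clipped, the new centre/size are computed, and the predicted corners are clamped into the image. The sketch below redoes that decoding for a single float box; it is hedged and deliberately simplified (the per-coordinate weights, scale_before/scale_after, quantisation and the correct_transform_coords offset handled by the kernel are omitted, and all values are made up).

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Input box as corners (x1, y1, x2, y2) and its deltas.
        const float b0 = 10.f, b1 = 20.f, b2 = 50.f, b3 = 60.f;
        const float dx = 0.1f, dy = -0.05f;
        float       dw = 0.2f, dh = 0.3f;
        const float bbox_xform_clip = 4.135f; // roughly log(1000/16), a common clip value
        const float img_w = 224.f, img_h = 224.f;

        // Recover centre and size (the +1 matches the convention in the kernel above).
        const float width  = b2 - b0 + 1.f;
        const float height = b3 - b1 + 1.f;
        const float ctr_x  = b0 + 0.5f * width;
        const float ctr_y  = b1 + 0.5f * height;

        // Clip dw/dh, then decode the predicted centre and size.
        dw = std::min(dw, bbox_xform_clip);
        dh = std::min(dh, bbox_xform_clip);
        const float pred_ctr_x = dx * width + ctr_x;
        const float pred_ctr_y = dy * height + ctr_y;
        const float pred_w     = std::exp(dw) * width;
        const float pred_h     = std::exp(dh) * height;

        // Clamp the predicted corners into the image.
        const auto clamp = [](float v, float lo, float hi) { return std::min(std::max(v, lo), hi); };
        const float x1 = clamp(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f);
        const float y1 = clamp(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f);
        const float x2 = clamp(pred_ctr_x + 0.5f * pred_w, 0.f, img_w - 1.f);
        const float y2 = clamp(pred_ctr_y + 0.5f * pred_h, 0.f, img_h - 1.f);

        std::printf("pred box: (%.1f, %.1f) - (%.1f, %.1f)\n", x1, y1, x2, y2);
        return 0;
    }
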
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
index 7f990396df..d8013c6227 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
@@ -30,7 +30,11 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void bounding_box_transform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
const size_t deltas_width = deltas->info()->tensor_shape()[0];
@@ -46,44 +50,53 @@ void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITe
auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
Iterator box_it(boxes, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto ptr = reinterpret_cast<T *>(box_it.ptr());
- const auto b0 = *ptr;
- const auto b1 = *(ptr + 1);
- const auto b2 = *(ptr + 2);
- const auto b3 = *(ptr + 3);
- const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
- const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
- const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
- const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
- for(size_t j = 0; j < num_classes; ++j)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Extract deltas
- const size_t delta_id = id.y() * deltas_width + 4u * j;
- const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
- const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
- T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
- T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
- // Clip dw and dh
- dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
- dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
- // Determine the predictions
- const T pred_ctr_x = dx * width + ctr_x;
- const T pred_ctr_y = dy * height + ctr_y;
- const T pred_w = std::exp(dw) * width;
- const T pred_h = std::exp(dh) * height;
- // Store the prediction into the output tensor
- pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
- pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
- pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
- pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
- }
- },
- box_it);
+ const auto ptr = reinterpret_cast<T *>(box_it.ptr());
+ const auto b0 = *ptr;
+ const auto b1 = *(ptr + 1);
+ const auto b2 = *(ptr + 2);
+ const auto b3 = *(ptr + 3);
+ const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
+ const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
+ const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
+ const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
+ for (size_t j = 0; j < num_classes; ++j)
+ {
+ // Extract deltas
+ const size_t delta_id = id.y() * deltas_width + 4u * j;
+ const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
+ const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
+ T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
+ T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
+ // Clip dw and dh
+ dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
+ dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
+ // Determine the predictions
+ const T pred_ctr_x = dx * width + ctr_x;
+ const T pred_ctr_y = dy * height + ctr_y;
+ const T pred_w = std::exp(dw) * width;
+ const T pred_h = std::exp(dh) * height;
+ // Store the prediction into the output tensor
+ pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 1] =
+ scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
+ pred_ptr[delta_id + 2] =
+ scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
+ pred_ptr[delta_id + 3] =
+ scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
+ }
+ },
+ box_it);
}
-void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+void bounding_box_transform_qsymm16(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
index b27c187df3..64ef815195 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
@@ -26,7 +26,11 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_qu16_boundingboxtransform(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)
{
return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window);
}
diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h
index 8f06acc8a6..4da725a257 100644
--- a/src/cpu/kernels/boundingboxtransform/list.h
+++ b/src/cpu/kernels/boundingboxtransform/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
- void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
+ void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \
+ const Window &window)
DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform);
DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform);
DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform);
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
index 6cd0c8500b..2897f4b242 100644
--- a/src/cpu/kernels/cast/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
+
#include "src/cpu/kernels/cast/list.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
#include "support/SaturateCast.h"
#include "arm_neon.h"
@@ -35,7 +36,8 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_qasymm8_signed_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+ int x = window_start_x;
- const int16x8x2_t texels =
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
+ const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_s32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
+ const float32x4x4_t texels = {
+ {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
+ vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp32_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float32x4x4_t texels =
+ const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12)
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
+ const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4),
+ vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}};
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+ vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
}
-void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp16_to_other_dt_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
- switch(_dst->info()->data_type())
+ switch (_dst->info()->data_type())
{
case DataType::QASYMM8_SIGNED:
{
/* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t texels = {{
vld1q_f16(src_ptr + x),
vld1q_f16(src_ptr + x + 8),
- }
- };
+ }};
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
- }
+ vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
/* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const float16x8x2_t texels = {{
vld1q_f16(src_ptr + x),
vld1q_f16(src_ptr + x + 8),
- }
- };
+ }};
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
+ vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])),
+ vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
+ }
- },
- src, dst);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::F32:
{
/* Up-conversion F16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
- vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
- vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+ vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
+ vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
+ vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
case DataType::S32:
{
/* Up-conversion F16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float16x8x2_t texels =
+ const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+
+ vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
+ vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
+ vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
break;
}
default:
@@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread
}
}
-void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_u8_to_fp16_cast(
+ const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_UNUSED(_policy);
@@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator src(_src, win);
Iterator dst(_dst, win);
/* Up-conversion U8 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+ const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+ const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const int16x8x2_t texels =
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
+ const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
+ const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+ vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+ vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+ }
+ },
+ src, dst);
return;
}
diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h
index ffd82d5bf3..5e634fc170 100644
--- a/src/cpu/kernels/cast/list.h
+++ b/src/cpu/kernels/cast/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_CAST_KERNEL(func_name) \
- void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
+#define DECLARE_CAST_KERNEL(func_name) \
+ void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \
+ const Window &window)
DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast);
DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast);
@@ -41,4 +42,4 @@ DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast);
#undef DECLARE_CAST_KERNEL
} // namespace cpu
} // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
\ No newline at end of file
+#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h
index 3bfa124dc3..082c60be29 100644
--- a/src/cpu/kernels/conv3d/neon/list.h
+++ b/src/cpu/kernels/conv3d/neon/list.h
@@ -27,8 +27,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/conv3d/neon/quantized.h"
namespace arm_compute
@@ -36,7 +37,12 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
+void directconv3d_float_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
{
const ITensor *src = src0;
const ITensor *weights = src1;
@@ -88,91 +94,104 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, con
Iterator wei(weights, window_w);
const T *biases_ptr = nullptr;
- if(biases != nullptr)
+ if (biases != nullptr)
{
biases_ptr = reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
}
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
- const int in_d_end_t = in_d_start_t + kernel_dim_d;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_d_start = std::max(in_d_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
- const int in_d_end = std::min(in_d_end_t, input_dim_d);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_d_start = in_d_start - in_d_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
- const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
-
- const int index_c_out_end = weights->info()->dimension(0);
- const int index_c_in_end = weights->info()->dimension(1);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n;
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- /*
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+ const int in_d_end_t = in_d_start_t + kernel_dim_d;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_d_start = std::max(in_d_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+ const int in_d_end = std::min(in_d_end_t, input_dim_d);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_d_start = in_d_start - in_d_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+ const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
+
+ const int index_c_out_end = weights->info()->dimension(0);
+ const int index_c_in_end = weights->info()->dimension(1);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[4] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
* This is the loop in the weights, and it goes along OFM (output feature map)
*/
- const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- T out_temp = static_cast<T>(0);
- T *out_ptr = reinterpret_cast<T *>(out.ptr());
- for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d)
- {
- const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
- const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+ const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ T out_temp = static_cast<T>(0);
+ T *out_ptr = reinterpret_cast<T *>(out.ptr());
+ for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+ ++index_wei_d, ++index_in_d)
{
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c_in = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
- index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
+ const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
{
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- //Load Cin weights
- for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+ const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
{
- w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c_in = 0;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+ index_c_in += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ //Load Cin weights
+ for (int k = 0; k < num_elems_read_per_iteration;
+ ++k, weights_ptr_mover += index_c_out_end)
+ {
+ w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ }
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_c_in < index_c_in_end;
+ ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_mover);
+ out_temp += src_val * w_val;
+ }
}
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
}
}
- }
- }
- *(reinterpret_cast<T *>(out_ptr + id_w[0])) = (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+ *(reinterpret_cast<T *>(out_ptr + id_w[0])) =
+ (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
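
The outer loop of directconv3d_float_neon_ndhwc above first computes the "theoretical" input window implied by the output coordinate, stride and padding, then clamps it to the input borders and derives the matching sub-range of kernel weights. A hedged sketch of that start/end bookkeeping, reduced to one spatial dimension with made-up sizes:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int input_dim_w   = 8; // input extent
        const int kernel_dim_w  = 3;
        const int conv_stride_w = 2;
        const int conv_pad_left = 1;

        for (int out_w = 0; out_w < 4; ++out_w)
        {
            // Theoretical input range for this output position (may fall outside the input).
            const int in_w_start_t = out_w * conv_stride_w - conv_pad_left;
            const int in_w_end_t   = in_w_start_t + kernel_dim_w;

            // Clamp to the valid input range.
            const int in_w_start = std::max(in_w_start_t, 0);
            const int in_w_end   = std::min(in_w_end_t, input_dim_w);

            // Matching sub-range of the kernel weights.
            const int wei_w_start = in_w_start - in_w_start_t;
            const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);

            // For out_w == 0 this prints: input [0, 2) weights [1, 3), i.e. the
            // first weight column is skipped because it falls in the left padding.
            std::printf("out %d: input [%d, %d) weights [%d, %d)\n", out_w, in_w_start, in_w_end, wei_w_start, wei_w_end);
        }
        return 0;
    }
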
diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h
index a8165b4944..f0fc9b5a71 100644
--- a/src/cpu/kernels/conv3d/neon/quantized.h
+++ b/src/cpu/kernels/conv3d/neon/quantized.h
@@ -28,16 +28,22 @@
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
+void directconv3d_quantized_neon_ndhwc(const ITensor *src0,
+ const ITensor *src1,
+ const ITensor *src2,
+ ITensor *dst,
+ const Conv3dInfo &conv_info,
+ const Window &window)
{
const ITensor *src = src0;
const ITensor *weights = src1;
@@ -104,153 +110,166 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1,
Iterator wei(weights, window_w);
const int32_t *biases_ptr = nullptr;
- if(biases != nullptr)
+ if (biases != nullptr)
{
biases_ptr = reinterpret_cast<int32_t *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
}
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
- const int in_d_end_t = in_d_start_t + kernel_dim_d;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+ const int in_d_end_t = in_d_start_t + kernel_dim_d;
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_d_start = std::max(in_d_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
- const int in_d_end = std::min(in_d_end_t, input_dim_d);
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_d_start = std::max(in_d_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+ const int in_d_end = std::min(in_d_end_t, input_dim_d);
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_d_start = in_d_start - in_d_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
- const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_d_start = in_d_start - in_d_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+ const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end);
- const int index_c_out_end = weights->info()->dimension(0);
- const int index_c_in_end = weights->info()->dimension(1);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n;
+ const int index_c_out_end = weights->info()->dimension(0);
+ const int index_c_in_end = weights->info()->dimension(1);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[4] * input_stride_n;
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- /*
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
* This is the loop in the weights, and it goes along OFM (output feature map)
*/
- const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- int32_t acc = static_cast<int32_t>(0);
- T *out_ptr = reinterpret_cast<T *>(out.ptr());
- for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d)
- {
- const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
- const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+ const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ int32_t acc = static_cast<int32_t>(0);
+ T *out_ptr = reinterpret_cast<T *>(out.ptr());
+ for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+ ++index_wei_d, ++index_in_d)
{
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c_in = 0;
- vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
-
- q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
-
- for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
- index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d;
+ const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
{
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- //Load Cin weights
- for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+ const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
{
- w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
- }
- q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c_in = 0;
+ vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+ q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
- const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
- const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
- const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
- const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
+ for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+ index_c_in += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ //Load Cin weights
+ for (int k = 0; k < num_elems_read_per_iteration;
+ ++k, weights_ptr_mover += index_c_out_end)
+ {
+ w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+ }
+ q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+ q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
- src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
- src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
- src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
- src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+ q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+ q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
- wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
- wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
- wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
- wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+ const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
+ const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
+ const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
+ const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
- acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
- acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
- acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
- acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
- }
+ src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
+ src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
+ src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
+ src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+
+ wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
+ wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
+ wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
+ wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+
+ acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
+ acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
+ acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
+ acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
+ }
#if defined(__aarch64__)
- acc += wrapper::vaddv(acc_q32_0);
- acc += wrapper::vaddv(acc_q32_1);
- acc += wrapper::vaddv(acc_q32_2);
- acc += wrapper::vaddv(acc_q32_3);
+ acc += wrapper::vaddv(acc_q32_0);
+ acc += wrapper::vaddv(acc_q32_1);
+ acc += wrapper::vaddv(acc_q32_2);
+ acc += wrapper::vaddv(acc_q32_3);
#else // __aarch64__
- auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
- temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
- temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
- temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
- temp = wrapper::vpadd(temp, temp);
- acc += wrapper::vgetlane(temp, 0);
+ temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
+ temp = wrapper::vpadd(temp, temp);
+ acc += wrapper::vgetlane(temp, 0);
#endif // __aarch64__
- for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
- {
- const auto src_val = *(in_ptr_mover) + input_offset;
- const auto w_val = *(weights_ptr_mover) + weights_offset;
- acc += src_val * w_val;
+ for (; index_c_in < index_c_in_end;
+ ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+ {
+ const auto src_val = *(in_ptr_mover) + input_offset;
+ const auto w_val = *(weights_ptr_mover) + weights_offset;
+ acc += src_val * w_val;
+ }
+ }
}
}
- }
- }
- if(biases)
- {
- acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]);
- }
+ if (biases)
+ {
+ acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]);
+ }
- T out_val = finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
- *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+ T out_val =
+ finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
+ *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
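[Editor's aside on the quantized conv3d hunk above: the int32 accumulator adds (src + input_offset) * (w + weights_offset) per channel and is then requantized by finalize_quantization with an output multiplier, shift and offset. The sketch below is a scalar reference model of that requantization step, assuming the usual fixed-point scheme (Q0.31 multiplier, rounding right shift); it is not ACL's implementation and the helper names are illustrative. The depthwise hunks later in this patch use the same scheme through saturating_doubling_high_mul and rounding_divide_by_exp2, handling shift < 0 by pre-multiplying the accumulator by (1 << -shift).]

// Reference model (illustrative only) of 8-bit requantization of an int32 accumulator.
#include <algorithm>
#include <cstdint>
#include <limits>

int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
{
    // High 32 bits of 2*a*b with round-to-nearest; the only overflow case is
    // INT32_MIN * INT32_MIN, which saturates.
    if (a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
        return std::numeric_limits<int32_t>::max();
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int64_t nudge = ab >= 0 ? (1LL << 30) : 1 - (1LL << 30);
    return static_cast<int32_t>((ab + nudge) / (1LL << 31));
}

int32_t rounding_divide_by_pot(int32_t x, int exponent) // 0 <= exponent <= 31
{
    // Arithmetic shift right with round-half-away-from-zero.
    const int32_t mask      = (int32_t{1} << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

uint8_t requantize_u8(int32_t acc, int32_t multiplier, int32_t shift, int32_t output_offset)
{
    // shift >= 0 assumed here for brevity; the kernels also handle shift < 0
    // by scaling the accumulator with (1 << -shift) before the multiply.
    int32_t out = rounding_divide_by_pot(saturating_doubling_high_mul(acc, multiplier), shift);
    out += output_offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(out, 0, 255));
}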
diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h
index 1fe8e11e98..8fb7ad2087 100644
--- a/src/cpu/kernels/crop/generic/neon/crop_helper.h
+++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h
@@ -80,7 +80,7 @@ inline float32x4_t load_as_f32(uint8_t *ptr)
{
return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
}
-}
+} // namespace cpu
} // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
\ No newline at end of file
+#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp
index 218ebba191..3739c9d4e0 100644
--- a/src/cpu/kernels/crop/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp
@@ -29,12 +29,19 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void fp16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
-}
+ return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp
index 16d0218fce..f665c3652c 100644
--- a/src/cpu/kernels/crop/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp
@@ -28,11 +28,18 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void fp32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
-}
+ return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h
index a59588be45..b90ba9ddbf 100644
--- a/src/cpu/kernels/crop/generic/neon/impl.h
+++ b/src/cpu/kernels/crop/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/crop/generic/neon/crop_helper.h"
namespace arm_compute
@@ -35,19 +36,26 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
// Reverse elements if width flipped.
- if(is_width_flipped)
+ if (is_width_flipped)
{
// Collapse first dimension if possible.
- if(input_has_single_channel)
+ if (input_has_single_channel)
{
int32_t x = output_width_start;
Coordinates negative_offset(input_offset);
negative_offset.set(1, negative_offset[1] - window_step_x + 1);
- for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+ for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
{
auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
@@ -57,25 +65,27 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o
wrapper::vstore(output_ptr + x, in);
}
input_offset[1] = negative_offset[1] + window_step_x - 1;
- for(; x < output_width_limit; ++x, --input_offset[1])
+ for (; x < output_width_limit; ++x, --input_offset[1])
{
*(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
}
}
else
{
- for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+ for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
{
input_offset.set(0, 0);
int32_t c = 0;
- for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
+ for (; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x;
+ c += window_step_x, input_offset[0] += window_step_x)
{
auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
}
- for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+ for (; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
{
- *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ *(output_ptr + x * output->info()->dimension(0) + c) =
+ static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
}
}
}
@@ -83,25 +93,28 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o
else
{
// Use memcpy if the elements don't need converting to float.
- if(std::is_same<T, float>::value)
+ if (std::is_same<T, float>::value)
{
memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
- (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
+ (output_width_limit - output_width_start) * output->info()->dimension(0) *
+ output->info()->element_size());
}
else
{
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ int32_t x = 0;
+ int32_t limit =
+ (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+ for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
{
auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
wrapper::vstore(output_start_ptr + x, in);
}
- for(; x < limit; ++x, ++input_offset[0])
+ for (; x < limit; ++x, ++input_offset[0])
{
- *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+ *(output_start_ptr + x) =
+ static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
}
}
}
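[Editor's aside on the crop impl.h hunk above: in_bounds_crop_window copies a window of the source tensor into a float output buffer, converting elements on the fly and taking a memcpy fast path when the source type is already float; width-flipped windows walk the input backwards instead. A minimal sketch of the non-flipped row copy follows, with illustrative names only, not the ACL kernel.]

// Illustrative sketch of copying one crop row into a float buffer.
#include <cstring>
#include <type_traits>

template <typename T>
void crop_row_to_f32(const T *in, float *out, int count)
{
    if (std::is_same<T, float>::value)
    {
        // Fast path: elements need no conversion, copy the raw bytes.
        std::memcpy(out, in, count * sizeof(float));
    }
    else
    {
        for (int i = 0; i < count; ++i)
        {
            out[i] = static_cast<float>(in[i]); // widen u8/u16/s16/... to float
        }
    }
}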
diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp
index ebf2c1fbd3..602434f54f 100644
--- a/src/cpu/kernels/crop/generic/neon/integer.cpp
+++ b/src/cpu/kernels/crop/generic/neon/integer.cpp
@@ -29,46 +29,88 @@ namespace arm_compute
{
namespace cpu
{
-void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u8_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s8_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s16_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+ return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
-void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s32_in_bounds_crop_window(const ITensor *input,
+ const ITensor *output,
+ float *output_ptr,
+ Coordinates input_offset,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
- return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset,
- window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
-}
+ return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+ output_width_limit, input_has_single_channel, is_width_flipped);
}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h
index a6b83215ae..9cb7726203 100644
--- a/src/cpu/kernels/crop/list.h
+++ b/src/cpu/kernels/crop/list.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/crop/generic/neon/impl.h"
namespace arm_compute
@@ -36,7 +37,8 @@ namespace cpu
{
#define DECLARE_CROP_KERNEL(func_name) \
void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+ int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \
+ bool input_has_single_channel, bool is_width_flipped)
DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window);
DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window);
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
index e85a1664ea..293e606d81 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
@@ -29,11 +29,16 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_fp16_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_float<float16_t, float16_t>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
index b2333a3334..c6fa4790b7 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
@@ -26,10 +26,15 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_fp32_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_float<float, float>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
index a2ae5564e6..d08e973968 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/function_info/ConvolutionInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -65,8 +67,16 @@ inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
namespace
{
template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_multiplier1_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
constexpr auto element_per_vector = vector_size / sizeof(T);
@@ -75,7 +85,8 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei
using AccType = int32_t;
using AccArrayType = std::array<AccType, element_per_vector>;
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
@@ -104,152 +115,175 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto const base_weights_ptr = weights_it.ptr();
- size_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
{
- AccArrayType acc{};
- AccArrayType in_sum{};
- AccArrayType we_sum{};
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ auto const base_weights_ptr = weights_it.ptr();
+ size_t x = run_info.x_start;
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- out_of_bound_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ AccArrayType acc{};
+ AccArrayType in_sum{};
+ AccArrayType we_sum{};
+
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
- for(size_t i = 0; i < element_per_vector; ++i)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
+ {
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- acc.at(i) += input_vals[i] * weights_vals[i];
- in_sum.at(i) += input_vals[i];
- we_sum.at(i) += weights_vals[i];
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : out_of_bound_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) += input_vals[i] * weights_vals[i];
+ in_sum.at(i) += input_vals[i];
+ we_sum.at(i) += weights_vals[i];
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
}
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
+ for (size_t i = 0; i < element_per_vector; ++i)
+ {
+ acc.at(i) -= in_sum.at(i) * weights_qoffset;
+ acc.at(i) -= we_sum.at(i) * input_qoffset;
+ acc.at(i) += k_offset;
- VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) -= in_sum.at(i) * weights_qoffset;
- acc.at(i) -= we_sum.at(i) * input_qoffset;
- acc.at(i) += k_offset;
+ if (has_biases)
+ {
+ acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ }
- if(has_biases)
- {
- acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+ const int32_t out_mul = output_multiplier.at(x + i);
+ const int32_t out_shift = output_shift.at(x + i);
+ if (out_shift < 0)
+ {
+ acc.at(i) =
+ saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(i) =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) +
+ output_qoffset;
+ }
+ out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
}
- const int32_t out_mul = output_multiplier.at(x + i);
- const int32_t out_shift = output_shift.at(x + i);
- if(out_shift < 0)
- {
- acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
- }
- out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
}
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
- }
-
- // left-over
- for(; x < run_info.x_end; ++x)
- {
- AccType acc = 0;
- AccType in_sum = 0;
- AccType we_sum = 0;
+ // left-over
+ for (; x < run_info.x_end; ++x)
+ {
+ AccType acc = 0;
+ AccType in_sum = 0;
+ AccType we_sum = 0;
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
+ auto weights_ptr = base_weights_ptr;
+ auto input_offset = base_input_offset;
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ?
- *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
- out_of_bound_value;
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc += input_val * weights_val;
- in_sum += input_val;
- we_sum += weights_val;
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : out_of_bound_value;
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc += input_val * weights_val;
+ in_sum += input_val;
+ we_sum += weights_val;
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ T out_vals{0};
- T out_vals{ 0 };
+ acc -= in_sum * weights_qoffset;
+ acc -= we_sum * input_qoffset;
+ acc += k_offset;
- acc -= in_sum * weights_qoffset;
- acc -= we_sum * input_qoffset;
- acc += k_offset;
+ if (has_biases)
+ {
+ acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
+ }
- if(has_biases)
- {
- acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
- }
+ const int32_t out_mul = output_multiplier.at(x);
+ const int32_t out_shift = output_shift.at(x);
- const int32_t out_mul = output_multiplier.at(x);
- const int32_t out_shift = output_shift.at(x);
+ if (out_shift < 0)
+ {
+ acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc =
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ }
- if(out_shift < 0)
- {
- acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
+ out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
}
-
- out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_generic_quantized(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
using AccType = int32_t;
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value =
+ PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
@@ -277,76 +311,93 @@ void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<AccType> acc(depth_multiplier, 0);
- std::vector<AccType> we_sum(depth_multiplier, 0);
- AccType in_sum = 0;
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<AccType> acc(depth_multiplier, 0);
+ std::vector<AccType> we_sum(depth_multiplier, 0);
+ AccType in_sum = 0;
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
-
- for(size_t m = 0; m < depth_multiplier; ++m)
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) += input_val * weights_val;
-
- we_sum.at(m) += weights_val;
- }
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : out_of_bound_value;
- offs += dilation.x() * run_info.input_stride_y;
- in_sum += input_val;
- }
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) += input_val * weights_val;
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ we_sum.at(m) += weights_val;
+ }
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- acc.at(m) -= in_sum * weights_qoffset;
- acc.at(m) -= we_sum.at(m) * input_qoffset;
- acc.at(m) += k_offset;
+ offs += dilation.x() * run_info.input_stride_y;
+ in_sum += input_val;
+ }
- if(has_biases)
- {
- acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
- const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
- if(out_shift < 0)
- {
- acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
+ for (size_t m = 0; m < depth_multiplier; ++m)
{
- acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
+ acc.at(m) -= in_sum * weights_qoffset;
+ acc.at(m) -= we_sum.at(m) * input_qoffset;
+ acc.at(m) += k_offset;
+
+ if (has_biases)
+ {
+ acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ }
+
+ const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
+ const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
+ if (out_shift < 0)
+ {
+ acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) +
+ output_qoffset;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) =
+ static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
}
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ std::vector<int> output_multiplier,
+ std::vector<int> output_shift,
+ const Window &window,
+ bool has_biases) // NOLINT
{
constexpr int half_vec = vector_size / 2;
@@ -355,11 +406,15 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
- const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+ const auto input_qoffset_vec = wrapper::vreinterpret(
+ wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
+ const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(
+ wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
+ const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset,
+ arm_compute::wrapper::traits::vector_128_tag{});
const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -389,7 +444,7 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
@@ -397,95 +452,117 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor
std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::fill(begin(acc0), end(acc0), zero);
- std::fill(begin(acc1), end(acc1), zero);
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::fill(begin(acc0), end(acc0), zero);
+ std::fill(begin(acc1), end(acc1), zero);
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- const int32_t current_h = input_z + h * dilation.y();
- if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ const int32_t current_h = input_z + h * dilation.y();
+ if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
{
- const int32_t current_w = input_y + w * dilation.x();
- if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
- const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
- const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ const int32_t current_w = input_y + w * dilation.x();
+ if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
{
- const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
- const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
-
- acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
- acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
+ const auto input_8x8 = wrapper::vdup_n(
+ *(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))),
+ TagType{});
+ const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
+ const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
+
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+ {
+ const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(
+ weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
+ const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
+
+ acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs),
+ wrapper::vgetlow(weights_no_offs));
+ acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs),
+ wrapper::vgethigh(weights_no_offs));
+ }
}
- }
- offs += dilation.x() * run_info.input_stride_y;
+ offs += dilation.x() * run_info.input_stride_y;
+ }
}
- }
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
- if(has_biases)
+ for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
{
- const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
+ if (has_biases)
+ {
+ const auto bias_val0 =
+ wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ const auto bias_val1 = wrapper::vloadq(
+ reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
- acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
- acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
- }
+ acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
+ acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
+ }
- if(out_shift < 0)
- {
- acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- }
- else
- {
- acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
- }
+ if (out_shift < 0)
+ {
+ acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul),
+ output_qoffset_vec);
+ }
+ else
+ {
+ acc0.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ acc1.at(i) = wrapper::vadd(
+ rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift),
+ output_qoffset_vec);
+ }
- acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
- acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
+ acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
+ acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
- const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
- wrapper::vmovn(acc1.at(i)));
+ const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i)));
- if(std::is_same<T, uint8_t>::value)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
- }
- else
- {
- wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
+ if (std::is_same<T, uint8_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)),
+ wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
+ }
+ else
+ {
+ wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)),
+ wrapper::vqmovn(out_val));
+ }
}
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
} // namespace
template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
PadStrideInfo conv_info = info.pad_stride_info;
unsigned int depth_multiplier = info.depth_multiplier;
@@ -497,15 +574,15 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co
const auto output_scale = dst->info()->quantization_info().uniform().scale;
auto weights_scale = weights->info()->quantization_info().scale();
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ if (!is_data_type_quantized_per_channel(weights->info()->data_type()))
{
- for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
+ for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
{
weights_scale.push_back(weights_scale.front());
}
}
- for(const auto &s : weights_scale)
+ for (const auto &s : weights_scale)
{
int32_t out_mult = 0;
int32_t out_shift = 0;
@@ -516,30 +593,49 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co
output_shift.push_back(out_shift);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier,
+ output_shift, window, has_biases);
}
else
{
const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
- if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
+ if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
{
- depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation,
+ depth_multiplier, output_multiplier, output_shift, window,
+ has_biases);
}
else
{
- depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+ depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier,
+ output_multiplier, output_shift, window, has_biases);
}
}
}
-template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute
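A note on the requantization arithmetic exercised by the hunks above: each accumulator is scaled by a per-channel (out_mul, out_shift) pair via saturating_doubling_high_mul and rounding_divide_by_exp2, offset by the output zero point, then clamped and narrowed. The pairs are precomputed earlier in this file from the tensor scales (typically from input_scale * weights_scale / output_scale). The scalar sketch below is illustrative only: it assumes the usual gemmlowp-style semantics for the two helpers, uses local names, and shows the signed-output clamp bounds; the real kernels operate on NEON vectors through the wrapper:: API.

#include <algorithm>
#include <cstdint>
#include <limits>

// Illustrative scalar model of the fixed-point requantization used above.
static int32_t sat_doubling_high_mul(int32_t a, int32_t b)
{
    // Saturating, rounding, doubling high half of the 64-bit product.
    if (a == std::numeric_limits<int32_t>::min() && b == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t prod  = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge = (prod >= 0) ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((prod + nudge) / (INT64_C(1) << 31));
}

static int32_t rounding_div_by_exp2(int32_t x, int exponent)
{
    // Rounding arithmetic shift right by 'exponent'.
    const int32_t mask      = (INT32_C(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

static int8_t requantize(int32_t acc, int32_t out_mul, int32_t out_shift, int32_t out_offset)
{
    // Same branch structure as the kernel: negative shifts pre-scale the accumulator,
    // non-negative shifts are applied as a rounding right shift after the multiply.
    const int32_t scaled = (out_shift < 0)
                               ? sat_doubling_high_mul(acc * (1 << (-out_shift)), out_mul)
                               : rounding_div_by_exp2(sat_doubling_high_mul(acc, out_mul), out_shift);
    return static_cast<int8_t>(std::min(std::max(scaled + out_offset, -128), 127));
}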
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
index 8410cdbf16..3fa5c58c3c 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
@@ -24,6 +24,7 @@
#ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
#define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -63,15 +64,21 @@ struct DepthwiseConvolutionRunInfo
const size_t input_width;
const size_t input_depth;
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
- : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ const PadStrideInfo &conv_info,
+ const Window &w,
+ uint32_t depth_multiplier = 1) // NOLINT
+ : num_read_elements_per_iteration(
+ (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
x_start(w.x().start()),
x_end(w.x().end()),
x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
input_stride_y(input.strides_in_bytes().y()),
input_stride_z(input.strides_in_bytes().z()),
- input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
+ input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) -
+ (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
weights_width(weights.dimension(width_idx)),
weights_height(weights.dimension(height_idx)),
weights_stride_y(weights.strides_in_bytes().y()),
@@ -87,7 +94,12 @@ struct DepthwiseConvolutionRunInfo
}
};
-inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
+inline bool is_valid_input_region(int32_t base_w,
+ uint32_t base_h,
+ uint32_t w,
+ uint32_t h,
+ const DepthwiseConvolutionRunInfo &run_info,
+ const Size2D &dilation)
{
const int32_t current_h = base_h + h * dilation.y();
const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
@@ -99,8 +111,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u
}
template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const Window &window,
+ bool has_biases)
{
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
@@ -129,94 +147,112 @@ void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, c
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto const base_weights_ptr = weights_it.ptr();
- uint32_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
{
- VectorType acc = zero_vector;
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
+ const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+ auto const base_weights_ptr = weights_it.ptr();
+ uint32_t x = run_info.x_start;
- for(uint32_t h = 0; h < run_info.weights_height; ++h)
+ for (; x < run_info.x_leftover_start; x += run_info.x_step)
{
- int64_t offs = input_offset + x * sizeof(T);
- for(uint32_t w = 0; w < run_info.weights_width; ++w)
+ VectorType acc = zero_vector;
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
+
+ for (uint32_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- zero_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
- acc = wrapper::vmla(acc, weights_vals, input_vals);
+ int64_t offs = input_offset + x * sizeof(T);
+ for (uint32_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? wrapper::vload(reinterpret_cast<T *>(
+ input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+ : zero_vector;
+ const auto weights_vals =
+ wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+ acc = wrapper::vmla(acc, weights_vals, input_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
+ }
- offs += dilation.x() * run_info.input_stride_y;
+ if (has_biases)
+ {
+ const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc = wrapper::vadd(acc, biases_vals);
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
+ wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
}
- if(has_biases)
+ for (; x < run_info.x_end; ++x)
{
- const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc = wrapper::vadd(acc, biases_vals);
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
- }
+ auto acc_scalar = T{0};
+ auto weights_ptr = base_weights_ptr;
+ int64_t input_offset = base_input_offset;
- for(; x < run_info.x_end; ++x)
- {
- auto acc_scalar = T{ 0 };
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
- const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc_scalar += (input_vals * weights_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
+ int64_t offs = input_offset + x * sizeof(T);
+ for (size_t w = 0; w < run_info.weights_width; ++w)
+ {
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_vals =
+ is_valid_region
+ ? *reinterpret_cast<T *>(input_it.ptr() +
+ std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+ : 0;
+ const auto weights_vals =
+ *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+ acc_scalar += (input_vals * weights_vals);
+
+ offs += dilation.x() * run_info.input_stride_y;
+ }
+
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc_scalar += biases_vals;
+ if (has_biases)
+ {
+ const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
+ acc_scalar += biases_vals;
+ }
+ *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
}
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T>
-void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
+void depthwise_loop_generic_fp(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ const Window &window,
+ bool has_biases)
{
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info =
+ DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Window execution_window = window;
execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -240,81 +276,98 @@ void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const
Iterator output_it(dst, win_output);
Iterator biases_it{};
- if(has_biases)
+ if (has_biases)
{
biases_it = Iterator(biases, win_weights);
}
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<T> acc(depth_multiplier, static_cast<T>(0));
+ execute_window_loop(
+ execution_window,
+ [&](const Coordinates &id)
+ {
+ std::vector<T> acc(depth_multiplier, static_cast<T>(0));
- const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+ const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+ const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+ int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
+ auto weights_ptr = weights_it.ptr();
+ for (size_t h = 0; h < run_info.weights_height; ++h)
{
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
-
- for(size_t m = 0; m < depth_multiplier; ++m)
+ int offs = input_offset;
+ for (size_t w = 0; w < run_info.weights_width; ++w)
{
- const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+ const auto input_val =
+ is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+ run_info.input_max_offset)))
+ : T(0);
+
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto weights_val =
+ *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+ acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+ }
+
+ offs += dilation.x() * run_info.input_stride_y;
}
- offs += dilation.x() * run_info.input_stride_y;
+ weights_ptr += run_info.weights_stride_z;
+ input_offset += dilation.y() * run_info.input_stride_z;
}
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
+ if (has_biases)
{
- const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+ }
}
- }
- else
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
+ else
{
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ for (size_t m = 0; m < depth_multiplier; ++m)
+ {
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+ }
}
- }
- },
- input_it, weights_it, biases_it, output_it);
+ },
+ input_it, weights_it, biases_it, output_it);
}
template <typename T, typename TW>
-void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_float(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
PadStrideInfo conv_info = info.pad_stride_info;
unsigned int depth_multiplier = info.depth_multiplier;
Size2D dilation = info.dilation;
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
}
else
{
- depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
+ depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window,
+ has_biases);
}
}
template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+void run_depthwise_quanitized8bit(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute
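As context for the reflowed loops above: DepthwiseConvolutionRunInfo precomputes x_step and x_leftover_start so that depthwise_loop_multiplier1_fp can process the channel dimension with full vector loads first and a scalar tail afterwards, while is_valid_input_region masks taps that land in the padded border (those contribute a zero input). The sketch below is a simplified, NEON-free illustration of that split; the sizes and names are placeholders and do not map one-to-one onto the wrapper:: API.

#include <cstddef>

// Simplified illustration of the vector-main-loop / scalar-tail split driven by
// x_leftover_start in the kernels above (scalar stand-in for the NEON body).
void multiply_accumulate_channels(const float *input, const float *weights, float *output, size_t depth)
{
    constexpr size_t vector_size    = 16 / sizeof(float); // elements per 128-bit register
    const size_t     leftover_start = depth - (depth % vector_size);

    size_t x = 0;
    for (; x < leftover_start; x += vector_size)
    {
        // In the real kernel this is a single wrapper::vload / vmla / vstore sequence.
        for (size_t lane = 0; lane < vector_size; ++lane)
        {
            output[x + lane] += input[x + lane] * weights[x + lane];
        }
    }
    for (; x < depth; ++x)
    {
        // Scalar leftover loop, mirroring the 'for (; x < run_info.x_end; ++x)' tail above.
        output[x] += input[x] * weights[x];
    }
}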
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
index 1bf7ad7007..d32847c1e8 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
@@ -26,16 +26,26 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qu8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<uint8_t, uint8_t>(src, weights, bias, dst, window, has_biases, info);
}
-void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<uint8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
index 58f7536064..682fad0bda 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
@@ -26,16 +26,26 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qs8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
}
-void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window,
+ bool has_biases,
+ const ConvolutionInfo &info)
{
return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h
index 44f055d6a9..cf80608f4f 100644
--- a/src/cpu/kernels/depthwiseconv2d/list.h
+++ b/src/cpu/kernels/depthwiseconv2d/list.h
@@ -27,9 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \
- void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, \
- ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \
+ void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \
+ const Window &window, bool has_biases, const ConvolutionInfo &info)
DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative);
DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative);
DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative);
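For reference, the reflowed macro is behaviourally identical to the previous one; DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative); still declares the same prototype, only the macro's own line breaks changed:

void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst,
                                   const Window &window, bool has_biases, const ConvolutionInfo &info);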
diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h
index 9a0472643d..5cbf7a36c6 100644
--- a/src/cpu/kernels/directconv2d/list.h
+++ b/src/cpu/kernels/directconv2d/list.h
@@ -32,8 +32,9 @@ namespace cpu
{
namespace kernels
{
-#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
- void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
+ void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \
+ const PadStrideInfo &conv_info)
DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
index a719fa50d6..218a4b7ee4 100644
--- a/src/cpu/kernels/directconv2d/nchw/all.cpp
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -22,18 +22,17 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
-
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
#include <algorithm>
@@ -44,22 +43,26 @@ namespace cpu
namespace kernels
{
template <typename T>
-void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+void convolve_nchw(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp16_nchw_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
convolve_nchw<float16_t>(window, src, weights, dst, conv_info);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp32_nchw_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
convolve_nchw<float>(window, src, weights, dst, conv_info);
}
template <typename T>
-void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void convolve_nchw(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(conv_info);
@@ -107,72 +110,81 @@ void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weig
constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
-        // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(2);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
- execute_window_loop(window_w, [&](const Coordinates & id_w)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
- T out_temp = static_cast<T>(0);
-
- for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
- {
- const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
- const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+            // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(2);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n;
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
{
- const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
- const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
- int index_w = in_w_start;
- int index_wei_w = wei_w_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_w < in_w_end; ++index_w, ++index_wei_w)
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+ T out_temp = static_cast<T>(0);
+
+ for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
{
- const auto src_val = *(in_ptr_row + index_w * input_stride_w);
- const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
- out_temp += src_val * w_val;
+ const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c;
+ const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
+ {
+ const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h;
+ const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+ int index_w = in_w_start;
+ int index_wei_w = wei_w_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_w <= ((in_w_end - num_elems_read_per_iteration));
+ index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_w < in_w_end; ++index_w, ++index_wei_w)
+ {
+ const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+ const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+ out_temp += src_val * w_val;
+ }
+ }
}
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
-
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void convolve_nchw<float16_t>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nchw<float16_t>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-template void convolve_nchw<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nchw<float>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
} // namespace kernels
} // namespace cpu
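The border handling that the reflowed convolve_nchw loop performs (and which the NHWC variants below repeat) boils down to clamping the theoretical input window against the tensor borders and deriving the first and last valid kernel taps from the amount clamped away. A minimal sketch of that arithmetic for a single output column, with illustrative names, is:

#include <algorithm>

struct ValidRangeX
{
    int in_start;  // first valid input column
    int wei_start; // first kernel tap that maps onto a valid input column
    int wei_end;   // one-past-last valid kernel tap
};

// Mirrors the in_w_start_t / in_w_start / wei_w_start / wei_w_end computations above.
ValidRangeX valid_taps_x(int out_x, int stride, int pad_left, int kernel_w, int input_w)
{
    const int in_start_t = out_x * stride - pad_left; // theoretical window start (may be negative)
    const int in_end_t   = in_start_t + kernel_w;     // theoretical window end (may exceed input_w)

    const int in_start = std::max(in_start_t, 0);
    const int in_end   = std::min(in_end_t, input_w);

    return ValidRangeX{in_start,
                       in_start - in_start_t,           // taps skipped on the left border
                       kernel_w - (in_end_t - in_end)}; // taps dropped on the right border
}

// Example: out_x = 0, stride = 1, pad_left = 1, kernel_w = 3, input_w = 8
// gives in_start = 0, wei_start = 1, wei_end = 3, i.e. the first tap falls in the padding.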
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
index 9982431de5..36a8e76f13 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
@@ -30,10 +30,11 @@ namespace cpu
{
namespace kernels
{
-void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp32_nhwc_directconv2d(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
convolve_nhwc<float>(window, src, weights, dst, conv_info);
}
} // namespace kernels
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
index 500ad1b420..f235167e28 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
@@ -24,16 +24,16 @@
#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <algorithm>
@@ -49,12 +49,14 @@ namespace
{
bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
{
- return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
-}
+ return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 &&
+ weights->padding().right == 0);
}
+} // namespace
template <typename T>
-void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void convolve_nhwc(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
// Declare useful types
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -97,7 +99,7 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig
constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
// nhwc optimized
- if(have_zero_x_internal_padding(src->info(), weights->info()))
+ if (have_zero_x_internal_padding(src->info(), weights->info()))
{
    // This function assumes that input and weights have no padding in the channel dimension
@@ -114,138 +116,154 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig
* multiplication works on the correct input/weight elements.
*/
execute_window_loop(
- window_out, [&](const Coordinates & id)
- {
- /*
+ window_out,
+ [&](const Coordinates &id)
+ {
+ /*
 * Here we create theoretical indexes which we then validate for both
 * inputs and weights.
 * As a reminder, this loop takes each output point in NHW; C is treated
 * in the weights loop.
*/
-            // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
- const int index_h_start = in_h_start - in_h_start_t;
- const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
- const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- execute_window_loop(
- window_w, [&](const Coordinates & id_w)
- {
- /*
+            // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+ const int index_h_start = in_h_start - in_h_start_t;
+ const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+ const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
+ {
+ /*
 * This is the loop over the weights, and it goes along N (the batches).
 * As a reminder, the batches of the weights are translated into the
 * channels of the output.
*/
- const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
- + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
- const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
- {
- const T *in_ptr_mover = in_ptr_row;
- int index_wc = index_wc_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_row + index_wc);
- out_temp += src_val * w_val;
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ const T *in_ptr_row =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+ const T *weights_ptr_row =
+ reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+ T out_temp = static_cast<T>(0);
+ for (int index_h = index_h_start; index_h < index_h_end;
+ ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
+ {
+ const T *in_ptr_mover = in_ptr_row;
+ int index_wc = index_wc_start;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_wc <= index_wc_end - num_elems_read_per_iteration;
+ index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_row + index_wc);
+ out_temp += src_val * w_val;
+ }
+ }
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
else // nhwc non optimized
{
execute_window_loop(
- window_out, [&](const Coordinates & id)
- {
-        // We are computing the theoretical input starting points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
-
- const int index_c_end = weights->info()->dimension(0);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
- execute_window_loop(
- window_w, [&](const Coordinates & id_w)
+ window_out,
+ [&](const Coordinates &id)
{
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+            // We are computing the theoretical input starting points
+ const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+ const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+ const int in_w_end_t = in_w_start_t + kernel_dim_w;
+ const int in_h_end_t = in_h_start_t + kernel_dim_h;
+
+ // We are computing the valid initial and ending input points by checking the borders
+ const int in_w_start = std::max(in_w_start_t, 0);
+ const int in_h_start = std::max(in_h_start_t, 0);
+ const int in_w_end = std::min(in_w_end_t, input_dim_w);
+ const int in_h_end = std::min(in_h_end_t, input_dim_h);
+
+ // We use the input points to select the valid weight points to use
+ const int wei_w_start = in_w_start - in_w_start_t;
+ const int wei_h_start = in_h_start - in_h_start_t;
+ const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
+ const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
+
+ const int index_c_end = weights->info()->dimension(0);
+ const T *const in_ptr_start =
+ reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+ id[3] * input_stride_n;
+
+ execute_window_loop(
+ window_w,
+ [&](const Coordinates &id_w)
{
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_mover);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+ const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+ uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
+
+ T out_temp = static_cast<T>(0);
+ for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+ ++index_wei_h, ++index_in_h)
{
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
+ const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
+ const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+ for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+ ++index_wei_w, ++index_in_w)
+ {
+ const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
+ const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+ int index_c = 0;
+ vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+ for (; index_c <= index_c_end - num_elems_read_per_iteration;
+ index_c += num_elems_read_per_iteration,
+ in_ptr_mover += num_elems_read_per_iteration,
+ weights_ptr_mover += num_elems_read_per_iteration)
+ {
+ const auto src_vec = wrapper::vloadq(in_ptr_mover);
+ const auto w_vec = wrapper::vloadq(weights_ptr_mover);
+ out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+ }
+ out_temp += vreduce(out_temp_vec);
+ for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+ {
+ const auto src_val = *(in_ptr_mover);
+ const auto w_val = *(weights_ptr_mover);
+ out_temp += src_val * w_val;
+ }
+ }
}
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+ },
+ wei);
},
- wei);
- },
- out);
+ out);
}
}
-template void convolve_nhwc<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nhwc<float>(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
} // namespace kernels
} // namespace cpu
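The two branches of convolve_nhwc above differ only in how the innermost reduction is addressed. When have_zero_x_internal_padding holds, width and channels of both the input row and the weight row are contiguous in memory, so the kernel sweeps them with a single fused index (index_wc) and vector loads can safely straddle the channel/width boundary; otherwise it falls back to separate width and channel loops. A stripped-down illustration of the fused sweep, assuming plain contiguous float buffers and illustrative names, is:

// Illustrative only: with no internal x padding, one index walks kernel_w * C
// contiguous elements of both the input row and the weight row.
float fused_wc_reduction(const float *in_row, const float *weights_row, int index_wc_start, int index_wc_end)
{
    float        acc = 0.f;
    const float *in  = in_row; // already positioned at the first valid (w, c) element
    for (int index_wc = index_wc_start; index_wc < index_wc_end; ++index_wc, ++in)
    {
        acc += (*in) * weights_row[index_wc];
    }
    return acc;
}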
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
index 3b26fcdf29..efb9ce8e2a 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
@@ -26,6 +26,7 @@
#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
#include "arm_compute/core/ITensor.h"
+
#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
@@ -35,7 +36,8 @@ namespace cpu
namespace kernels
{
template <typename T>
-void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+void convolve_nhwc(
+ const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
index 6091ef215e..9b4375f17c 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
@@ -35,14 +36,38 @@ void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window);
}
-template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -50,12 +75,30 @@ void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *
return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window);
}
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
index 2d8fec91c5..53ccd89dcc 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
@@ -34,25 +35,67 @@ void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
}
-template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window);
}
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
} // namespace arm_compute
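The fp32 file above forwards each call to elementwise_arithm_op with a 4-lane float vector trait and emits one explicit instantiation per ArithmeticOperation, so the operation is fixed at compile time. A hedged sketch of how a runtime operation code can be mapped onto such pre-built instantiations through a small function-pointer table follows; ArithOp, fp32_kernel and select_kernel are hypothetical names, not the library's dispatcher.

    // Sketch: map a runtime op enum onto compile-time instantiations via a
    // function-pointer table, so the inner loop contains no runtime branch.
    #include <cstddef>

    enum class ArithOp { Add, Sub, Div };

    template <ArithOp op>
    void fp32_kernel(const float *a, const float *b, float *out, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            switch (op) // op is a template parameter, resolved at compile time
            {
                case ArithOp::Add: out[i] = a[i] + b[i]; break;
                case ArithOp::Sub: out[i] = a[i] - b[i]; break;
                case ArithOp::Div: out[i] = a[i] / b[i]; break;
            }
        }
    }

    using KernelFn = void (*)(const float *, const float *, float *, std::size_t);

    KernelFn select_kernel(ArithOp op)
    {
        switch (op)
        {
            case ArithOp::Add: return &fp32_kernel<ArithOp::Add>;
            case ArithOp::Sub: return &fp32_kernel<ArithOp::Sub>;
            default:           return &fp32_kernel<ArithOp::Div>;
        }
    }

A caller would pick the kernel once (for example KernelFn fn = select_kernel(ArithOp::Add);) and then invoke fn per tensor window, which is the same idea the explicit instantiations above serve.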
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 98b154e8fd..98f7e8b949 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -39,7 +39,7 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type
vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
- switch(op)
+ switch (op)
{
case ArithmeticOperation::MAX:
res = wrapper::vmax(a, b);
@@ -71,7 +71,9 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type
}
template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
+typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a,
+ const ScalarType &broadcast_value,
+ const bool reorder)
{
using tag_type = typename VectorType::tag_type;
using vec_type = typename VectorType::type;
@@ -81,10 +83,15 @@ typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorT
}
template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+void elementwise_op(
+ const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+ int (*broadcast_func)(
+ int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+ int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -99,7 +106,7 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -114,20 +121,26 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_value, output_ptr, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(non_broadcast_input_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+ !is_broadcast_input_2 ? a : broadcast_value);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -139,21 +152,23 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+ for (; x < window_end_x; ++x)
+ {
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = (*scalar_func)(a, b);
+ }
+ },
+ input1, input2, output);
}
}
@@ -162,7 +177,7 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
{
auto res = ScalarType(0);
- switch(op)
+ switch (op)
{
case ArithmeticOperation::MAX:
res = std::max(a, b);
@@ -183,10 +198,10 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
case ArithmeticOperation::DIV:
{
res = a / b;
- if(std::is_integral<ScalarType>::value)
+ if (std::is_integral<ScalarType>::value)
{
res = (b == 0) ? 0 : res;
- if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
+ if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
{
--res;
}
@@ -205,43 +220,56 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar
}
template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
+inline int32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,
+ const int32x4_t &b)
{
return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
}
template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+ const float32x4_t &b)
{
return wrapper::vdiv(a, b);
}
template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+ const float32x4_t &b)
{
return wrapper::vpow(a, b);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
+ const float16x8_t &a, const float16x8_t &b)
{
return wrapper::vdiv(a, b);
}
template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+inline float16x8_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
+ const float16x8_t &a, const float16x8_t &b)
{
return wrapper::vpow(a, b);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+inline int elementwise_arithm_op_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const ScalarType *input1_ptr,
+ const ScalarType *input2_ptr,
+ ScalarType *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
@@ -251,14 +279,20 @@ inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int
}
template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+inline int elementwise_arithm_op_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const ScalarType *non_broadcast_input_ptr,
+ const ScalarType &broadcast_value,
+ ScalarType *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
+ wrapper::vstore(output_ptr + x,
+ elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
}
return x;
}
@@ -268,10 +302,10 @@ void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out,
{
using scalar_type = typename VectorType::scalar_type;
- elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
- &elementwise_arithm_op_scalar<op, scalar_type>,
- &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
- &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
+ elementwise_op<scalar_type, scalar_type, VectorType>(
+ in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>,
+ &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
+ &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
}
template <ComparisonOperation op, typename InputScalarType>
@@ -279,7 +313,7 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS
{
bool res = false;
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
res = (a == b);
@@ -308,9 +342,9 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS
template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
{
- OutputVectorType res = { 0, 0, 0, 0 };
+ OutputVectorType res = {0, 0, 0, 0};
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
res = wrapper::vceq(a, b);
@@ -338,53 +372,75 @@ inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const Inpu
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+inline OutputVectorType
+elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
{
InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+ return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a,
+ reorder ? a : broadcast_vector);
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_8_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
wrapper::vstore(output_ptr + x, a);
}
return x;
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_16_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
}
return x;
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_32_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *non_broadcast_input_ptr,
+ const InputScalarType &broadcast_value,
+ uint8_t *output_ptr,
+ const bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
- const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+ const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
}
- if(x <= window_end_x - 4)
+ if (x <= window_end_x - 4)
{
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- for(int i = 0; i < 4; i++)
+ const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+ wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+ for (int i = 0; i < 4; i++)
{
*(output_ptr + x + i) = wrapper::vgetlane(a, i);
}
@@ -394,11 +450,15 @@ inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_8_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
@@ -409,11 +469,15 @@ inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_16_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
@@ -424,11 +488,15 @@ inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_32_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const InputScalarType *input1_ptr,
+ const InputScalarType *input2_ptr,
+ uint8_t *output_ptr)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
auto a = wrapper::vloadq(input1_ptr + x);
auto b = wrapper::vloadq(input2_ptr + x);
@@ -438,12 +506,12 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int
const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
}
- if(x <= window_end_x - 4)
+ if (x <= window_end_x - 4)
{
const auto a = wrapper::vloadq(input1_ptr + x);
const auto b = wrapper::vloadq(input2_ptr + x);
const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- for(int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
{
*(output_ptr + x + i) = wrapper::vgetlane(res, i);
}
@@ -455,57 +523,59 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
}
template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+ elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+ in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+ &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+ &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
}
inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
- qasymm8x16_t x = vld1q_u8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- }
- };
+ qasymm8x16_t x = vld1q_u8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+ scale),
+ vmulq_f32(
+ vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ vmulq_f32(vcvtq_f32_s32(
+ vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+ scale),
+ }};
return out;
}
inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
{
- qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- }
- };
+ qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
+ const float32x4x4_t out = {{
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+ vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+ }};
return out;
}
@@ -523,17 +593,15 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
vst1q_u8(output_ptr, vcombine_u8(pa, pb));
}
-inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void
+store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
store_quantized(output_ptr, out);
}
@@ -544,17 +612,17 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
vst1q_s8(output_ptr, vcombine_s8(pa, pb));
}
-inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized_signed(int8_t *output_ptr,
+ const float32x4x4_t &rf,
+ const float32x4_t &offset,
+ const float32x4_t &invscale)
{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
+ int32x4x4_t out = {{
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+ vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+ }};
store_quantized_signed(output_ptr, out);
}
@@ -565,7 +633,8 @@ inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const floa
}
template <ArithmeticOperation op>
-inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+inline int8_t
+elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
{
return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
}
@@ -574,15 +643,12 @@ template <ArithmeticOperation op>
float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
{
using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
- float32x4x4_t out =
- {
- {
- elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
- }
- };
+ float32x4x4_t out = {{
+ elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
+ elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
+ }};
return out;
}
@@ -596,26 +662,29 @@ inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float
template <ComparisonOperation op>
inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
{
- uint32x4x4_t out =
- {
- {
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
- }
- };
+ uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
+ elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}};
return out;
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_arithm_op_quantized_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
// Get inputs and compute output
const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
@@ -627,13 +696,21 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_e
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *input1_ptr,
+ const int8_t *input2_ptr,
+ int8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
// Get inputs and compute output
const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
@@ -645,45 +722,71 @@ inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int w
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf =
+ elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
}
return x;
}
template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ int8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const float32x4x4_t rf =
+ elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
}
return x;
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_comp_op_quantized_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *input1_ptr,
+ const uint8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
@@ -694,14 +797,22 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_comp_op_quantized_signed_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *input1_ptr,
+ const int8_t *input2_ptr,
+ uint8_t *output_ptr,
+ int32x4_t voffset1,
+ int32x4_t voffset2,
+ float32x4_t vscale1,
+ float32x4_t vscale2,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
@@ -712,46 +823,85 @@ inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int win
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const uint8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf =
+ elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized(output_ptr + x, rf);
}
return x;
}
template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x,
+ int window_end_x,
+ int window_step_x,
+ const int8_t *non_broadcast_input_ptr,
+ float32x4x4_t broadcast_vector,
+ uint8_t *output_ptr,
+ int32x4_t voffset_non_broadcast,
+ float32x4_t vscale_non_broadcast,
+ float32x4_t voffseto,
+ float32x4_t invvscaleo,
+ bool reorder)
{
ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+ const float32x4x4_t af =
+ load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+ const uint32x4x4_t rf =
+ elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
store_quantized(output_ptr + x, rf);
}
return x;
}
-inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+inline void elementwise_op_quantized(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const uint8_t *,
+ float32x4x4_t,
+ uint8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const uint8_t *,
+ const uint8_t *,
+ uint8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -772,7 +922,7 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe
const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Select the broadcast input on the X axis
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -794,24 +944,28 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
+ const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -834,32 +988,56 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
}
}
-inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void
+elementwise_comp_quantized_signed(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const int8_t *,
+ float32x4x4_t,
+ uint8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const int8_t *,
+ const int8_t *,
+ uint8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -879,7 +1057,7 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor
const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Select the broadcast input on the X axis
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -901,24 +1079,28 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -941,32 +1123,56 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
}
}
-inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
+inline void
+elementwise_op_quantized_signed(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+ int (*broadcast_func)(int,
+ int,
+ int,
+ const int8_t *,
+ float32x4x4_t,
+ int8_t *,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ const bool),
+ int (*neon_func)(int,
+ int,
+ int,
+ const int8_t *,
+ const int8_t *,
+ int8_t *,
+ int32x4_t,
+ int32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t,
+ float32x4_t))
{
// Create input windows
Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -986,7 +1192,7 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i
const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
// Select the broadcast input on the X axis
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -1008,24 +1214,28 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+ const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+ const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+ broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+ voffseto, invvscaleo, !is_broadcast_input_2);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+ const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+ *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+ !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -1048,22 +1258,24 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+ voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+ const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+ *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+ }
+ },
+ input1, input2, output);
}
}
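For reference, the scalar tail that the reformatted quantized kernels fall back to after the vectorized body (the for (; x < window_end_x; ++x) loops above) follows a dequantize/combine/requantize pattern. A minimal, self-contained sketch of that pattern is below; QInfo, dequant_s8 and quant_s8 are simplified illustrative stand-ins, not Compute Library identifiers, and the rounding policy is deliberately simplified.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QInfo { float scale; int32_t offset; }; // simplified stand-in for a uniform quantization info

    static float dequant_s8(int8_t v, const QInfo &qi)
    {
        return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
    }

    static int8_t quant_s8(float v, const QInfo &qi)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
        return static_cast<int8_t>(std::max(-128, std::min(127, q)));
    }

    // Scalar tail for an element-wise ADD over [x, end), after the vector loop has returned x.
    void add_tail_s8(const int8_t *in1, const int8_t *in2, int8_t *out, int x, int end,
                     const QInfo &q1, const QInfo &q2, const QInfo &qo)
    {
        for (; x < end; ++x)
        {
            const float a = dequant_s8(in1[x], q1); // dequantize both operands to float
            const float b = dequant_s8(in2[x], q2);
            out[x]        = quant_s8(a + b, qo);    // apply the op, then requantize to int8
        }
    }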
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
index c5c528d3f3..09ad13d5eb 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
{
@@ -33,63 +34,165 @@ void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window);
}
-template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ArithmeticOperation op>
void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window);
}
-template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window);
}
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window);
}
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window);
}
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+} // namespace cpu
} // namespace arm_compute
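The long runs of template void ... lines reflowed above are explicit instantiations: the template definition stays in one .cpp and each operation is instantiated there, so other translation units only need a declaration. A minimal sketch of that pattern, with illustrative names (apply_op, Op) that are not Compute Library identifiers:

    #include <cstddef>

    enum class Op { ADD, SUB };

    template <Op op>
    void apply_op(const int *in1, const int *in2, int *out, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = (op == Op::ADD) ? in1[i] + in2[i] : in1[i] - in2[i];
    }

    // Explicit instantiations emitted into this translation unit only:
    template void apply_op<Op::ADD>(const int *, const int *, int *, std::size_t);
    template void apply_op<Op::SUB>(const int *, const int *, int *, std::size_t);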
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
index fa8e08745a..d891f70644 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
{
@@ -33,27 +34,72 @@ void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe
return elementwise_arithm_op_quantized<op>(in1, in2, out, window);
}
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comp_op_quantized<op>(in1, in2, out, window);
}
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
index abfdf93b75..b1f8e018f5 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
namespace arm_compute
@@ -34,27 +35,70 @@ void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i
return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window);
}
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window);
}
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
index 85224351df..600c7f1c05 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
@@ -25,6 +25,7 @@
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
@@ -36,14 +37,38 @@ void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
return elementwise_arithmetic_op<float16_t>(in1, in2, out, op, window);
}
-template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -51,14 +76,32 @@ void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *i
return elementwise_comparison_op<float16_t>(in1, in2, out, op, window);
}
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
index 2b479f76f1..832a966883 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
@@ -34,26 +35,68 @@ void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
return elementwise_arithmetic_op<float32_t>(in1, in2, out, op, window);
}
-template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<float>(in1, in2, out, op, window);
}
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
index c0515f2abc..fa48407e9b 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
@@ -23,7 +23,9 @@
*/
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+
#include "src/core/NEON/SVEMath.h"
+
#include <arm_sve.h>
namespace arm_compute
@@ -33,7 +35,8 @@ namespace cpu
using namespace arm_compute::wrapper;
template <typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+void elementwise_arithmetic_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
{
using VectorType = typename sve_vector<ScalarType>::type;
@@ -51,7 +54,7 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -66,37 +69,40 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_vector = svdup_n(broadcast_value);
-
- int x = window_start_x;
-
- svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
- VectorType res{};
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const auto broadcast_vector = svdup_n(broadcast_value);
- if(is_broadcast_input_2)
- {
- res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op);
- }
- else
+ int x = window_start_x;
+
+ svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+ do
{
- res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op);
- }
- svst1(pg, output_ptr + x, res);
-
- x += svcnt<ScalarType>();
- pg = svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+ VectorType res{};
+
+ if (is_broadcast_input_2)
+ {
+ res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector,
+ broadcast_vector, op);
+ }
+ else
+ {
+ res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(
+ pg, broadcast_vector, non_broadcast_vector, op);
+ }
+ svst1(pg, output_ptr + x, res);
+
+ x += svcnt<ScalarType>();
+ pg = svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -108,39 +114,46 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto in1 = svld1(pg, input1_ptr + x);
- const auto in2 = svld1(pg, input2_ptr + x);
- const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
- svst1(pg, output_ptr + x, res);
-
- x += svcnt<ScalarType>();
- pg = svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = svld1(pg, input1_ptr + x);
+ const auto in2 = svld1(pg, input2_ptr + x);
+ const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
+ svst1(pg, output_ptr + x, res);
+
+ x += svcnt<ScalarType>();
+ pg = svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
-template void elementwise_arithmetic_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
template <typename InputScalarType, typename OutputScalarType>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+void elementwise_comparison_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+ static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+ "input data type's width should be equal to or greater than output data type's width");
using OutputVectorType = typename sve_vector<OutputScalarType>::type;
const auto all_true_pg = svptrue<InputScalarType>();
@@ -157,7 +170,7 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -172,37 +185,44 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
- const auto broadcast_vector = svdup_n(broadcast_value);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+ const auto broadcast_vector = svdup_n(broadcast_value);
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- OutputVectorType res{};
- if(is_broadcast_input_2)
- {
- res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op);
- }
- else
+ svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+ do
{
- res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op);
- }
- svst1(output_pg, output_ptr + x, res);
-
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+ const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+ OutputVectorType res{};
+ if (is_broadcast_input_2)
+ {
+ res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(
+ pg, non_broadcast_vector, broadcast_vector, op);
+ }
+ else
+ {
+ res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(
+ pg, broadcast_vector, non_broadcast_vector, op);
+ }
+ svst1(output_pg, output_ptr + x, res);
+
+ x += svcnt<InputScalarType>();
+ pg = svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -214,37 +234,45 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *
Iterator input2(in2, input2_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto in1 = svld1(pg, input1_ptr + x);
- const auto in2 = svld1(pg, input2_ptr + x);
- const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- svst1(output_pg, output_ptr + x, res);
-
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = svld1(pg, input1_ptr + x);
+ const auto in2 = svld1(pg, input2_ptr + x);
+ const auto res =
+ elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+ typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
+ const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+ svst1(output_pg, output_ptr + x, res);
+
+ x += svcnt<InputScalarType>();
+ pg = svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
-template void elementwise_comparison_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<uint8_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int16_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int32_t>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
template <>
svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
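The SVE loops reformatted above all share one control pattern: svwhilelt builds a lane predicate from the current index and the end of the row, the predicated loads/stores process however many lanes remain, and svptest_any exits once no lanes are active. A minimal sketch of that pattern for a float32 ADD, assuming an SVE-enabled toolchain (e.g. -march=armv8-a+sve); it is an illustration of the loop shape, not Compute Library code:

    #include <arm_sve.h>

    void add_f32_sve(const float *in1, const float *in2, float *out, int start, int end)
    {
        int      x  = start;
        svbool_t pg = svwhilelt_b32_s32(x, end); // predicate covers lanes [x, end)
        do
        {
            const svfloat32_t a = svld1_f32(pg, in1 + x); // inactive lanes are not accessed
            const svfloat32_t b = svld1_f32(pg, in2 + x);
            svst1_f32(pg, out + x, svadd_f32_z(pg, a, b));

            x += static_cast<int>(svcntw());     // advance by the number of 32-bit lanes
            pg = svwhilelt_b32_s32(x, end);      // next predicate covers the remaining tail
        } while (svptest_any(svptrue_b32(), pg));
    }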
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
index 860c50a1e0..4c61b9f315 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/core/NEON/wrapper/svtraits.h"
@@ -51,7 +52,7 @@ svbool_t narrow_to_byte_predicate(svbool_t pg)
{
const auto all_false = svpfalse();
- switch(bytewidth)
+ switch (bytewidth)
{
case 8:
pg = svuzp1_b32(pg, all_false);
@@ -74,7 +75,7 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve
using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
VectorType res{};
- switch(op)
+ switch (op)
{
case ArithmeticOperation::MAX:
res = svmax_z(pg, a, b);
@@ -114,11 +115,12 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve
}
template <typename InputVectorType, typename OutputVectorType>
-OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
+OutputVectorType
+elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
{
svbool_t selection_vector{};
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
selection_vector = svcmpeq(pg, a, b);
@@ -154,10 +156,12 @@ OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &
}
template <typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
+void elementwise_arithmetic_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
template <typename ScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
+void elementwise_comparison_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
index c313fc6e04..f7714ff7e9 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
namespace arm_compute
{
@@ -33,64 +34,166 @@ void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor
{
return elementwise_arithmetic_op<int32_t>(in1, in2, out, op, window);
}
-template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ArithmeticOperation op>
void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_arithmetic_op<int16_t>(in1, in2, out, op, window);
}
-template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<uint8_t>(in1, in2, out, op, window);
}
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<int16_t>(in1, in2, out, op, window);
}
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
return elementwise_comparison_op<int32_t>(in1, in2, out, op, window);
}
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
index 41e0ac77db..7c6015d379 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
@@ -35,19 +35,14 @@ inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint3
{
auto x = svld1(pg, ptr);
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
+ const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
pg = svptrue_b8();
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
+ return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
}
inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
@@ -56,28 +51,24 @@ inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint
//vprint(x);
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
+ const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
pg = svptrue_b8();
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
+ return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
+ svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
}
-inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void
+store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+ const auto quantized =
+ svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
@@ -85,13 +76,14 @@ inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const
svst1(pg, ptr, narrowed);
}
-inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void
+store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+ const auto quantized =
+ svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+ svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
@@ -101,7 +93,8 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const
}
template <typename ScalarType>
-void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+void elementwise_arithmetic_quantized_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
{
const auto all_true_pg = wrapper::svptrue<ScalarType>();
@@ -120,7 +113,7 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -128,8 +121,10 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
- const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
- const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+ const auto non_broadcast_qinfo =
+ is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+ const auto broadcast_qinfo =
+ is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
@@ -141,48 +136,52 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const float broadcast_value_f = Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
- const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
-
- svfloat32x4_t result{};
-
- if(!is_broadcast_input_2)
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+ const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+ const float broadcast_value_f =
+ Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+ const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+ svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
{
- result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
- }
- else
- {
- result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
- }
-
- store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto in1 =
+ load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+
+ svfloat32x4_t result{};
+
+ if (!is_broadcast_input_2)
+ {
+ result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
+ }
+ else
+ {
+ result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+ }
+
+ store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -200,41 +199,44 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2,
const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
- const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
-
- const auto result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
-
- store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+ const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+
+ const auto result =
+ svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+ elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+
+ store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
template <typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+void elementwise_comparison_quantized_op(
+ const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+ static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+ "input data type's width should be equal to or greater than output data type's width");
using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
const auto all_true_pg = wrapper::svptrue<InputScalarType>();
@@ -251,7 +253,7 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
const auto window_end_x = static_cast<int>(window.x().end());
const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -259,8 +261,10 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
- const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
- const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+ const auto non_broadcast_qinfo =
+ is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+ const auto broadcast_qinfo =
+ is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
@@ -272,51 +276,63 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
- const float broadcast_value_f = Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
- const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
-
- svuint8x4_t result{};
-
- if(!is_broadcast_input_2)
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto non_broadcast_input_ptr =
+ reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+ const InputScalarType broadcast_value =
+ *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+ const float broadcast_value_f =
+ Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+ const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+ svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
+
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ do
{
- result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), svget4(in1, 0), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), svget4(in1, 1), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), svget4(in1, 2), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 3), svget4(in1, 3), op));
- }
- else
- {
- result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op));
- }
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, output_ptr + x, zipped);
-
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
+ const auto in1 =
+ load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+
+ svuint8x4_t result{};
+
+ if (!is_broadcast_input_2)
+ {
+ result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0),
+ svget4(in1, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1),
+ svget4(in1, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2),
+ svget4(in1, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+ pg, svget4(in2, 3), svget4(in1, 3), op));
+ }
+ else
+ {
+ result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+ svget4(in2, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+ svget4(in2, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+ svget4(in2, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+ pg, svget4(in1, 3), svget4(in2, 3), op));
+ }
+
+ const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+ const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
+ const auto zipped = svzip1(zipped_bottom, zipped_top);
+ svst1(pg, output_ptr + x, zipped);
+
+ x += wrapper::svcnt<InputScalarType>();
+ pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -334,39 +350,44 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2,
const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+ const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
- int x = window_start_x;
+ int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
- const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
- const auto result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op));
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, output_ptr + x, zipped);
-
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
+ svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ do
+ {
+ const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+ const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+ const auto result =
+ svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+ svget4(in2, 0), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+ svget4(in2, 1), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+ svget4(in2, 2), op),
+ elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3),
+ svget4(in2, 3), op));
+
+ const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+ const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
+ const auto zipped = svzip1(zipped_bottom, zipped_top);
+ svst1(pg, output_ptr + x, zipped);
+
+ x += wrapper::svcnt<InputScalarType>();
+ pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input1, input2, output);
}
}
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
\ No newline at end of file
+#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
index 7435bb4f29..5cc66642d7 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
namespace arm_compute
{
@@ -34,27 +35,72 @@ void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe
return elementwise_arithmetic_quantized_op<uint8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comparison_quantized_op<uint8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
index 1027a1eed0..165e0c05fa 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
namespace arm_compute
{
@@ -34,27 +35,70 @@ void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i
return elementwise_arithmetic_quantized_op<int8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
template <ComparisonOperation op>
-void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window)
{
return elementwise_comparison_quantized_op<int8_t>(in1, in2, out, op, window);
}
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+ const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
index b2833c2481..2588db024d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -23,17 +23,19 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<__fp16>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
index 6566821eca..936a2e588a 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<float>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index dbc1dde4fa..d54d3984cb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
@@ -36,7 +37,7 @@ namespace cpu
template <typename ScalarType>
inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return 1 / sqrt(a);
@@ -60,7 +61,7 @@ inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarTyp
template <typename ScalarType, typename VectorType>
inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return wrapper::vinvsqrt(a);
@@ -94,22 +95,24 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
- }
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
- }
- },
- input, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
+ }
+ },
+ input, output);
}
template <>
@@ -128,75 +131,81 @@ inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int8x16_t vout;
- auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto vconst_0_f32 = vdupq_n_f32(0);
- auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
-
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
+ int8x16_t vout;
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
- // Perform activation
- float32x4x4_t vtmp_deq =
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
{
- {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
elementwise_op_imp<float>(op, vin_deq.val[0]),
elementwise_op_imp<float>(op, vin_deq.val[1]),
elementwise_op_imp<float>(op, vin_deq.val[2]),
elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
}
- };
- if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
- {
- vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
- vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
- vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
- vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ // Re-quantize to new output space
+ vout = vquantize_signed(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
}
-
- // Re-quantize to new output space
- vout = vquantize_signed(vtmp_deq, qi_out);
- wrapper::vstore(output_ptr + x, vout);
- }
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- if(tmp_f <= 0.0)
+ for (; x < window_end_x; ++x)
{
- if(op == ElementWiseUnary::LOG)
- {
- tmp_f = (-128 - qi_out.offset) * qi_out.scale;
- }
- else if(op == ElementWiseUnary::RSQRT)
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ if (tmp_f <= 0.0)
{
- tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
}
else
{
tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
}
+ tmp = quantize_qasymm8_signed(
+ tmp_f, qi_out,
+ RoundingPolicy::
+ TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
+ // For aarch64 LUT is used and rounding to nearest is used
+ *(output_ptr + x) = tmp;
}
- else
- {
- tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
- }
- tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
- // For aarch64 LUT is used and rounding to nearest is used
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
template <>
inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
@@ -215,71 +224,74 @@ inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Windo
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- uint8x16_t vout;
- auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
- auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = wrapper::vloadq(input_ptr + x);
+ uint8x16_t vout;
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ int x = window_start_x;
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- float32x4x4_t vtmp_deq =
- {
- {
+ // Perform activation
+ float32x4x4_t vtmp_deq = {{
elementwise_op_imp<float>(op, vin_deq.val[0]),
elementwise_op_imp<float>(op, vin_deq.val[1]),
elementwise_op_imp<float>(op, vin_deq.val[2]),
elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }};
+ if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] =
+ vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] =
+ vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] =
+ vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] =
+ vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
}
- };
- if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
- {
- vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
- vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
- vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
- vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
- }
- // Re-quantize to new output space
- vout = vquantize(vtmp_deq, qi_out);
- wrapper::vstore(output_ptr + x, vout);
- }
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- float tmp_f = dequantize_qasymm8(in, qi_in);
- if(tmp_f <= 0.0)
+ // Re-quantize to new output space
+ vout = vquantize(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for (; x < window_end_x; ++x)
{
- if(op == ElementWiseUnary::LOG)
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ if (tmp_f <= 0.0)
{
- tmp_f = (0 - qi_out.offset) * qi_out.scale;
- }
- else if(op == ElementWiseUnary::RSQRT)
- {
- tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ if (op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (0 - qi_out.offset) * qi_out.scale;
+ }
+ else if (op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
}
else
{
tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
}
+ tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+ *(output_ptr + x) = tmp;
}
- else
- {
- tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
- }
- tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
+ },
+ input, output);
}
} // namespace cpu
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
index dfe5e30035..d4daad4ca6 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -22,16 +22,18 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<int32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
index 08bb7f28b6..38cb61d0ff 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
@@ -32,24 +33,28 @@ namespace cpu
#ifdef __aarch64__
-void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(op);
- auto win = window;
+ auto win = window;
const auto window_end_x = window.x().end();
win.set(0, Window::Dimension(0, 1, 1));
Iterator src_it(in, win);
Iterator dst_it(out, win);
- execute_window_loop(win, [&](const Coordinates &) {
- const auto src_ptr = src_it.ptr();
- auto dst_ptr = dst_it.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
- lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
- },
- src_it, dst_it);
+ lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
}
#endif // __aarch64__
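Editorial sketch: the aarch64 q8 path above computes nothing per element; a 256-entry lookup table (the lut argument) is prepared once per configuration and the kernel simply gathers through it. A sketch of the idea follows, with an illustrative LOG table builder that is not the library's actual API:

    // Illustrative only: a 256-entry LUT replaces per-element
    // dequantize/compute/requantize for 8-bit data.
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    std::array<uint8_t, 256> build_log_lut(float in_scale, int in_offset, float out_scale, int out_offset)
    {
        std::array<uint8_t, 256> lut{};
        for (int q = 0; q < 256; ++q)
        {
            const float x = (q - in_offset) * in_scale;                      // dequantize
            const float y = (x <= 0.0f) ? (0 - out_offset) * out_scale : std::log(x);
            const int   o = static_cast<int>(y / out_scale) + out_offset;    // requantize
            lut[q] = static_cast<uint8_t>(o < 0 ? 0 : (o > 255 ? 255 : o));
        }
        return lut;
    }

    void apply_lut(const uint8_t *src, uint8_t *dst, std::size_t len, const std::array<uint8_t, 256> &lut)
    {
        for (std::size_t i = 0; i < len; ++i)
        {
            dst[i] = lut[src[i]]; // the NEON/SVE2 kernels vectorise exactly this gather
        }
    }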
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
index d987f7747b..3e4b88eb47 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Window.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
{
#ifndef __aarch64__
// Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<uint8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
index e00970a1e0..a5f4b053e3 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Window.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
namespace arm_compute
@@ -31,7 +32,8 @@ namespace cpu
{
#ifndef __aarch64__
// Fallback function to be used for armv7a, for aarch64 LUT is used
-void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_signed_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_op<int8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
index a883309b2e..22ff43c5d9 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
@@ -30,11 +31,12 @@ namespace arm_compute
{
namespace cpu
{
-void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp16_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<float16_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
index b21ed8ddbc..394bd47adf 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
@@ -30,10 +31,11 @@ namespace arm_compute
{
namespace cpu
{
-void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<float32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
index a948862906..5af534d9e7 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
@@ -31,9 +32,10 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
return svinvsqrt(pg, a);
@@ -55,9 +57,10 @@ inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::val
}
template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::NEG:
return svneg_z(pg, a);
@@ -81,23 +84,24 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto vin = svld1(pg, input_ptr + x);
- svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ int x = window_start_x;
+
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto vin = svld1(pg, input_ptr + x);
+ svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
+ },
+ input, output);
}
template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
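Editorial sketch: the SVE loop reflowed above is predicate-driven; svwhilelt produces a predicate covering the elements still to be processed, so the final partial vector needs no scalar tail loop. A minimal stand-alone sketch of the same pattern, assuming an SVE-capable compiler and target, with negation standing in for the real op:

    // Illustrative only: predicated SVE loop, no scalar remainder needed.
    #include <arm_sve.h>
    #include <cstdint>

    void negate_f32_sve(const float *src, float *dst, int64_t len)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, len);
        do
        {
            const svfloat32_t v = svld1_f32(pg, src + x);   // inactive lanes are not loaded
            svst1_f32(pg, dst + x, svneg_f32_z(pg, v));     // inactive lanes are not stored
            x += svcntw();                                  // 32-bit lanes per vector
            pg = svwhilelt_b32(x, len);
        } while (svptest_any(svptrue_b32(), pg));
    }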
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
index 068c3f7cda..e27fe5a87f 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -23,16 +23,18 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_s32_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(lut);
return elementwise_sve_op<int32_t>(in, out, window, op);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
index 7e32f50132..4e4582debb 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
@@ -23,13 +23,15 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/lut/list.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve2_q8_elementwise_unary(
+ const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
{
ARM_COMPUTE_UNUSED(op);
@@ -40,14 +42,16 @@ void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &wi
Iterator src_it(in, win);
Iterator dst_it(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = src_it.ptr();
- auto dst_ptr = dst_it.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto src_ptr = src_it.ptr();
+ auto dst_ptr = dst_it.ptr();
- lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
- },
- src_it, dst_it);
+ lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+ },
+ src_it, dst_it);
}
} // namespace cpu
diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h
index 4367e0ffc9..5ac78df324 100644
--- a/src/cpu/kernels/floor/list.h
+++ b/src/cpu/kernels/floor/list.h
@@ -28,8 +28,7 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_FLOOR_KERNEL(func_name) \
- void func_name(const void *src, void *dst, int len)
+#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len)
DECLARE_FLOOR_KERNEL(fp16_neon_floor);
DECLARE_FLOOR_KERNEL(fp32_neon_floor);
diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp
index f362676a36..f47690277d 100644
--- a/src/cpu/kernels/floor/neon/fp16.cpp
+++ b/src/cpu/kernels/floor/neon/fp16.cpp
@@ -45,14 +45,14 @@ void fp16_neon_floor(const void *src, void *dst, int len)
auto psrc = static_cast<const __fp16 *>(src);
auto pdst = static_cast<__fp16 *>(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc)));
psrc += step;
pdst += step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*pdst = std::floor(*psrc);
++psrc;
diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp
index f5efb2e849..a86e24d3c3 100644
--- a/src/cpu/kernels/floor/neon/fp32.cpp
+++ b/src/cpu/kernels/floor/neon/fp32.cpp
@@ -43,14 +43,14 @@ void fp32_neon_floor(const void *src, void *dst, int len)
auto psrc = static_cast<const float *>(src);
auto pdst = static_cast<float *>(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
psrc += step;
pdst += step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*pdst = std::floor(*psrc);
++pdst;
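Editorial sketch: both floor kernels above follow the usual NEON structure of a full-width vector loop followed by a scalar loop for leftover elements. A self-contained sketch of that structure (illustrative only; the library uses its own vfloorq_* wrappers, here the ACLE vrndmq_f32 intrinsic stands in):

    // Illustrative only: vector main loop plus scalar tail.
    #include <arm_neon.h>
    #include <cmath>

    void floor_f32(const float *src, float *dst, int len)
    {
        constexpr int step = 4; // four float32 lanes per 128-bit NEON register
        for (; len >= step; len -= step, src += step, dst += step)
        {
            vst1q_f32(dst, vrndmq_f32(vld1q_f32(src))); // round toward minus infinity == floor
        }
        for (; len > 0; --len)
        {
            *dst++ = std::floor(*src++); // leftover elements one at a time
        }
    }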
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
index a29ee762fc..2821af32ce 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
@@ -29,11 +29,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_conv_f16(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv_f16(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
index 076e97651d..3ca5b6977a 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
@@ -28,11 +28,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv_f32(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
index b9017600d6..6fa843263a 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -32,8 +33,16 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv(const ITensor *conv_weights,
+ const ITensor *conv_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
using ScalarType = T;
const int size = 16 / conv_weights->info()->element_size();
@@ -53,13 +62,20 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *
Iterator conv_w_in(conv_weights, win);
Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
- const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+ const auto conv_bias_in =
+ (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto conv_bias_out =
+ (run_in_place_bias ? conv_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -73,59 +89,61 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *
auto gamma = ScalarType(1.0);
auto beta = ScalarType(0.0);
auto conv_bias_in_scalar = ScalarType(0.0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- var = input_var[id[3]];
- if(input_gamma != nullptr)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- gamma = input_gamma[id[3]];
- }
+ var = input_var[id[3]];
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id[3]];
+ }
- if((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
- {
- if(input_beta != nullptr)
+ if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
{
- beta = input_beta[id[3]];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id[3]];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Construct vectors
+ mean = input_mean[id[3]];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+
+ if (conv_bias_in != nullptr)
+ {
+ conv_bias_in_scalar = conv_bias_in[id[3]];
+ }
+ auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta;
}
- // Construct vectors
- mean = input_mean[id[3]];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ int x = window_start_x;
+ auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
+ auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- if(conv_bias_in != nullptr)
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- conv_bias_in_scalar = conv_bias_in[id[3]];
- }
- auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta;
- }
-
- int x = window_start_x;
- auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
- auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto wn = wrapper::vloadq(conv_w_in_ptr + x);
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
+ auto wn = wrapper::vloadq(conv_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
- // Store results
- wrapper::vstore(conv_w_out_ptr + x, wn);
- }
+ // Store results
+ wrapper::vstore(conv_w_out_ptr + x, wn);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- conv_w_in, conv_w_out);
-}
-}
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ conv_w_in, conv_w_out);
}
+} // namespace cpu
+} // namespace arm_compute
#endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
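Editorial sketch: the template reformatted above folds a batch-normalization stage into the preceding convolution, i.e. w' = w * gamma / sqrt(var + eps) and b' = (b - mean) * gamma / sqrt(var + eps) + beta, with gamma defaulting to 1 and beta to 0 when absent. A scalar sketch of that fold (illustrative, not the library's vectorised implementation):

    // Illustrative only: fold batch norm parameters into conv weights/bias.
    #include <cmath>
    #include <cstddef>

    void fold_bn_into_conv(float *weights, std::size_t weights_per_filter, std::size_t num_filters,
                           float *bias, const float *mean, const float *var,
                           const float *gamma, const float *beta, float epsilon)
    {
        for (std::size_t f = 0; f < num_filters; ++f)
        {
            const float inv_std = 1.0f / std::sqrt(var[f] + epsilon);
            const float g       = (gamma != nullptr) ? gamma[f] : 1.0f;
            const float b       = (beta != nullptr) ? beta[f] : 0.0f;
            bias[f] = (bias[f] - mean[f]) * inv_std * g + b;      // fused bias
            for (std::size_t i = 0; i < weights_per_filter; ++i)
            {
                weights[f * weights_per_filter + i] *= inv_std * g; // fused weights
            }
        }
    }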
diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h
index e25b1e5fed..a03dd74f78 100644
--- a/src/cpu/kernels/fuse_batch_normalization/list.h
+++ b/src/cpu/kernels/fuse_batch_normalization/list.h
@@ -30,15 +30,18 @@ namespace cpu
{
#define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \
void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \
void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \
void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+ const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \
+ float epsilon, const Window &window)
DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16);
DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32);
@@ -50,7 +53,7 @@ DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_
#undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL
#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL
#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL
-}
-}
+} // namespace cpu
+} // namespace arm_compute
-#endif //
\ No newline at end of file
+#endif //
diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
index 1e3be8792d..c0b0dfd4dc 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
@@ -29,8 +29,16 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
using ScalarType = T;
const int size = 16 / dwc_weights->info()->element_size();
@@ -50,13 +58,20 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso
Iterator dwc_w_in(dwc_weights, win);
Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
- const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+ const auto dwc_bias_in =
+ (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto dwc_bias_out =
+ (run_in_place_bias ? dwc_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -70,74 +85,92 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso
auto gamma = ScalarType(1.0);
auto beta = ScalarType(0.0);
auto dwc_bias_in_scalar = ScalarType(0.0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- var = input_var[id[2]];
- if(input_gamma != nullptr)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- gamma = input_gamma[id[2]];
- }
-
- if(id[1] == 0)
- {
- mean = input_mean[id[2]];
-
- // Construct vectors
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- if(input_beta != nullptr)
+ var = input_var[id[2]];
+ if (input_gamma != nullptr)
{
- beta = input_beta[id[2]];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ gamma = input_gamma[id[2]];
}
- if(dwc_bias_in != nullptr)
+ if (id[1] == 0)
{
- dwc_bias_in_scalar = dwc_bias_in[id[2]];
+ mean = input_mean[id[2]];
+
+ // Construct vectors
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id[2]];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_in_scalar = dwc_bias_in[id[2]];
+ }
+
+ auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta;
}
- auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta;
- }
+ int x = window_start_x;
+ auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- int x = window_start_x;
- auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
- // Store results
- wrapper::vstore(dwc_w_out_ptr + x, wn);
- }
+ // Store results
+ wrapper::vstore(dwc_w_out_ptr + x, wn);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- dwc_w_in, dwc_w_out);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ dwc_w_in, dwc_w_out);
}
-void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
index 275211ff38..1d88d3b494 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
@@ -30,11 +30,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
index 67169c5325..1f336bb196 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
@@ -29,11 +29,19 @@ namespace arm_compute
{
namespace cpu
{
-void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
- return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
- bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+ return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+ bn_var, bn_beta, bn_gamma, epsilon, window);
}
} // namespace cpu
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
index 6f0386276f..5b74a7aef6 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -32,8 +33,16 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights,
+ const ITensor *dwc_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window)
{
using ScalarType = T;
const int size = 16 / dwc_weights->info()->element_size();
@@ -53,13 +62,20 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso
Iterator dwc_w_in(dwc_weights, win);
Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
- const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
- auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+ const auto dwc_bias_in =
+ (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+ auto dwc_bias_out =
+ (run_in_place_bias ? dwc_bias_in
+ : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_gamma = (bn_gamma != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
+ const auto input_beta = (bn_beta != nullptr)
+ ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+ : nullptr;
auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -73,81 +89,84 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso
auto beta = ScalarType(0.0);
auto dwc_bias_in_scalar = ScalarType(0);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- var_vec = wrapper::vloadq(input_var + x);
- if(input_gamma != nullptr)
- {
- gamma_vec = wrapper::vloadq(input_gamma + x);
- }
-
- if((id[2] == 0) && (id[1] == 0))
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- mean_vec = wrapper::vloadq(input_mean + x);
-
- // Construct vectors
- if(input_beta != nullptr)
+ var_vec = wrapper::vloadq(input_var + x);
+ if (input_gamma != nullptr)
{
- beta_vec = wrapper::vloadq(input_beta + x);
+ gamma_vec = wrapper::vloadq(input_gamma + x);
}
- if(dwc_bias_in != nullptr)
+ if ((id[2] == 0) && (id[1] == 0))
{
- dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+ mean_vec = wrapper::vloadq(input_mean + x);
+
+ // Construct vectors
+ if (input_beta != nullptr)
+ {
+ beta_vec = wrapper::vloadq(input_beta + x);
+ }
+
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+ }
+
+ auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec),
+ wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
+ dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
+ wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
}
- auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
- dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
- wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
- }
-
- auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
- rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- wn = wrapper::vmul(wn, rvar_vec);
- wn = wrapper::vmul(wn, gamma_vec);
+ auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+ rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ wn = wrapper::vmul(wn, rvar_vec);
+ wn = wrapper::vmul(wn, gamma_vec);
- // Store results
- wrapper::vstore(dwc_w_out_ptr + x, wn);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto var = input_var[x];
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[x];
+ // Store results
+ wrapper::vstore(dwc_w_out_ptr + x, wn);
}
- if(id[2] == 0 && id[1] == 0)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- auto mean = input_mean[x];
- if(input_beta != nullptr)
+ auto var = input_var[x];
+ if (input_gamma != nullptr)
{
- beta = input_beta[x];
+ gamma = input_gamma[x];
}
- if(dwc_bias_in != nullptr)
+
+ if (id[2] == 0 && id[1] == 0)
{
- dwc_bias_in_scalar = dwc_bias_in[x];
+ auto mean = input_mean[x];
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[x];
+ }
+ if (dwc_bias_in != nullptr)
+ {
+ dwc_bias_in_scalar = dwc_bias_in[x];
+ }
+
+ auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+ dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta;
}
- auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
- dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta;
- }
-
- const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
- auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+ const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+ auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
- *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
- }
- },
- dwc_w_in, dwc_w_out);
+ *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+ }
+ },
+ dwc_w_in, dwc_w_out);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
index 505a37174e..4d7507a5da 100644
--- a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
@@ -48,30 +48,32 @@ void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window,
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- int x = window_start_x;
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x);
- const float16x8x2_t c = vld2q_f16(in_ptr + x);
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
- alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
- vst2q_f16(out_ptr + x, alpha_ab);
- }
+ int x = window_start_x;
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x);
+ const float16x8x2_t c = vld2q_f16(in_ptr + x);
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+ alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
- }
- },
- in, out);
+ vst2q_f16(out_ptr + x, alpha_ab);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
+ }
+ },
+ in, out);
}
} // namespace
void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta)
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
index dd0384ca13..47de0f3928 100644
--- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -44,33 +45,35 @@ void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window,
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- int x = window_start_x;
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x);
- const float32x4x4_t c = vld4q_f32(in_ptr + x);
+ const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
- alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
- alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
- alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+ int x = window_start_x;
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x);
+ const float32x4x4_t c = vld4q_f32(in_ptr + x);
- vst4q_f32(out_ptr + x, alpha_ab);
- }
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+ alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+ alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+ alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) += *(in_ptr + x) * beta;
- }
- },
- in, out);
+ vst4q_f32(out_ptr + x, alpha_ab);
+ }
+
+ // Left-over loop
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) += *(in_ptr + x) * beta;
+ }
+ },
+ in, out);
}
} // namespace cpu
} // namespace arm_compute
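Editorial sketch: matrix_addition_f32 above computes dst += beta * src, processing 16 floats per iteration via vld4q/vmlaq and finishing with a scalar tail. A minimal single-register sketch of the same accumulation (NEON-only, illustrative):

    // Illustrative only: beta-scaled matrix accumulation, one 4-lane register per step.
    #include <arm_neon.h>

    void matrix_add_f32(const float *src, float *dst, int len, float beta)
    {
        const float32x4_t beta_v = vdupq_n_f32(beta);
        int x = 0;
        for (; x <= len - 4; x += 4)
        {
            // vmlaq_f32(a, b, c) == a + b * c
            vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src + x), beta_v));
        }
        for (; x < len; ++x)
        {
            dst[x] += src[x] * beta; // leftover elements
        }
    }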
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
index 8fd79f9287..60fda511e3 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
@@ -32,7 +32,8 @@ namespace arm_compute
{
namespace cpu
{
-void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void vector_matrix_multiply_f16(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size());
@@ -42,7 +43,8 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
const int window_start_x = 32 * info.thread_id;
const int window_step_x = 32 * info.num_threads;
const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
- ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
+ ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x,
+ " (window_end_x - window_start_x) must be multiple of window_step_x");
Window win_out(window);
win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -55,7 +57,7 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -70,169 +72,172 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
- execute_window_loop(win_out, [&](const Coordinates &)
- {
- int x = window_start_x;
- // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
- // window_end_x is computed above which may cause out-of-bound writes to the dst.
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &)
{
- if(x > width_matrix_b)
+ int x = window_start_x;
+ // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
+ // window_end_x is computed above which may cause out-of-bound writes to the dst.
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
{
- return;
- }
-
- auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float16x8_t acc0 = vdupq_n_f16(0.f);
- float16x8_t acc1 = vdupq_n_f16(0.f);
- float16x8_t acc2 = vdupq_n_f16(0.f);
- float16x8_t acc3 = vdupq_n_f16(0.f);
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
- auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
- const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4);)
- {
- const float16x4_t a0l = vld1_f16(vec_a);
-
- float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
- float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
- float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
- float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
- float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
-
- matrix_b += 2 * in_b_stride;
-
- b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
- b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
- b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
- b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
- b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
- acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
- acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
- acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
- acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
-
- vec_a += 4;
- matrix_b += 2 * in_b_stride;
- }
+ float16x8_t acc0 = vdupq_n_f16(0.f);
+ float16x8_t acc1 = vdupq_n_f16(0.f);
+ float16x8_t acc2 = vdupq_n_f16(0.f);
+ float16x8_t acc3 = vdupq_n_f16(0.f);
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float16_t a0 = *vec_a;
- const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
- const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
- const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
- const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
-
- acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
- acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
- acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
- acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
-
- matrix_b += in_b_stride;
- }
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4);)
+ {
+ const float16x4_t a0l = vld1_f16(vec_a);
+
+ float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+ matrix_b += 2 * in_b_stride;
+
+ b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+ b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+ b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+ b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+ acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+ acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+ acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+ acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+ vec_a += 4;
+ matrix_b += 2 * in_b_stride;
+ }
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc0 = vmulq_f16(acc0, alpha_f16);
- acc1 = vmulq_f16(acc1, alpha_f16);
- acc2 = vmulq_f16(acc2, alpha_f16);
- acc3 = vmulq_f16(acc3, alpha_f16);
- }
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float16_t a0 = *vec_a;
+ const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+ const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+ const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+ const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+ acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+ acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+ acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+ acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+ matrix_b += in_b_stride;
+ }
- auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc0 = vmulq_f16(acc0, alpha_f16);
+ acc1 = vmulq_f16(acc1, alpha_f16);
+ acc2 = vmulq_f16(acc2, alpha_f16);
+ acc3 = vmulq_f16(acc3, alpha_f16);
+ }
- vst1q_f16(vec_out + 0, acc0);
- vst1q_f16(vec_out + 8, acc1);
- vst1q_f16(vec_out + 16, acc2);
- vst1q_f16(vec_out + 24, acc3);
- }
+ auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
- for(; x < window_end_x; ++x)
- {
- if(x > width_matrix_b)
- {
- return;
+ vst1q_f16(vec_out + 0, acc0);
+ vst1q_f16(vec_out + 8, acc1);
+ vst1q_f16(vec_out + 16, acc2);
+ vst1q_f16(vec_out + 24, acc3);
}
- auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
+ for (; x < window_end_x; ++x)
+ {
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float16x4_t vacc = vdup_n_f16(0.f);
+ auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
- auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
- const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
- {
- const float16x4_t a0l = vld1_f16(vec_a);
+ float16x4_t vacc = vdup_n_f16(0.f);
- const float16x4_t b_col =
+ auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr());
+ const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
{
- *(matrix_b + 0 * in_b_stride),
- *(matrix_b + 1 * in_b_stride),
- *(matrix_b + 2 * in_b_stride),
- *(matrix_b + 3 * in_b_stride),
- };
+ const float16x4_t a0l = vld1_f16(vec_a);
- vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
+ const float16x4_t b_col = {
+ *(matrix_b + 0 * in_b_stride),
+ *(matrix_b + 1 * in_b_stride),
+ *(matrix_b + 2 * in_b_stride),
+ *(matrix_b + 3 * in_b_stride),
+ };
- matrix_b += 4 * in_b_stride;
- }
+ vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
- float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
+ matrix_b += 4 * in_b_stride;
+ }
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float16_t a0 = *vec_a;
- const float16_t b00 = *matrix_b;
+ float16_t acc =
+ vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
- acc += b00 * a0;
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float16_t a0 = *vec_a;
+ const float16_t b00 = *matrix_b;
- matrix_b += in_b_stride;
- }
+ acc += b00 * a0;
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc *= static_cast<float16_t>(alpha);
- }
+ matrix_b += in_b_stride;
+ }
- auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc *= static_cast<float16_t>(alpha);
+ }
- *(vec_out) = acc;
- }
- },
- ina, inb, out);
+ auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+
+ *(vec_out) = acc;
+ }
+ },
+ ina, inb, out);
}
-void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void matrix_matrix_multiply_f16(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
ARM_COMPUTE_UNUSED(info);
- const int out_width = static_cast<int>(dst->info()->dimension(0));
- const int out_height = static_cast<int>(dst->info()->dimension(1));
- const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
- const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+ const int out_width = static_cast<int>(dst->info()->dimension(0));
+ const int out_height = static_cast<int>(dst->info()->dimension(1));
+ const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+ const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
const int num_elems_matrix_b_x = rhs->info()->dimension(0);
    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the input interleaved matrix A has 4 times fewer rows than the dst matrix
@@ -243,7 +248,7 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -259,22 +264,16 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
- const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
- auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
- float16x8x4_t c =
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- {
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f),
- vdupq_n_f16(0.f)
- }
- };
+ const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
+ const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
+ auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
+ float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}};
- /*
+ /*
This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
|a00 a01 a02 a03 | a04 a05 a06 a07|
|a10 a11 a12 a13 | a14 a15 a16 a17|
@@ -302,111 +301,118 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor
The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
*/
- const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
-
- for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
-
- {
- const float16x8_t p00 = vld1q_f16(mtx_a0);
- const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
-
- const float16x8_t q00 = vld1q_f16(mtx_b0);
- const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
- const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
- const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+ const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+ for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
-
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
-
- mtx_a0 += 16;
- mtx_b0 += 32;
- }
+ {
+ const float16x8_t p00 = vld1q_f16(mtx_a0);
+ const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+ const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
+ const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
+ const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+
+ mtx_a0 += 16;
+ mtx_b0 += 32;
+ }
- for(; mtx_b0 < mtx_b0_end_addr;)
+ for (; mtx_b0 < mtx_b0_end_addr;)
- {
- const float16x4_t p00 = vld1_f16(mtx_a0);
- const float16x8_t q00 = vld1q_f16(mtx_b0);
+ {
+ const float16x4_t p00 = vld1_f16(mtx_a0);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
- c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
- c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
- c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
- c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
- mtx_a0 += 4;
- mtx_b0 += 8;
- }
+ mtx_a0 += 4;
+ mtx_b0 += 8;
+ }
- if(multiply_alpha)
- {
- c.val[0] = vmulq_f16(c.val[0], alpha_f16);
- c.val[1] = vmulq_f16(c.val[1], alpha_f16);
- c.val[2] = vmulq_f16(c.val[2], alpha_f16);
- c.val[3] = vmulq_f16(c.val[3], alpha_f16);
- }
+ if (multiply_alpha)
+ {
+ c.val[0] = vmulq_f16(c.val[0], alpha_f16);
+ c.val[1] = vmulq_f16(c.val[1], alpha_f16);
+ c.val[2] = vmulq_f16(c.val[2], alpha_f16);
+ c.val[3] = vmulq_f16(c.val[3], alpha_f16);
+ }
- if(id.x() < (out_width - 8))
- {
- vst1q_f16(mtx_out, c.val[0]);
- if(id.y() + 1 < out_height)
+ if (id.x() < (out_width - 8))
{
- vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
- if(id.y() + 2 < out_height)
+ vst1q_f16(mtx_out, c.val[0]);
+ if (id.y() + 1 < out_height)
{
- vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
- if(id.y() + 3 < out_height)
+ vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
+ if (id.y() + 2 < out_height)
{
- vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ }
}
}
}
- }
- else
- {
- // Left-over columns
- const int columns_left = out_width - id.x();
- for(int x = 0; x < columns_left; ++x)
+ else
{
- *(mtx_out + x) = c.val[0][x];
- if(id.y() + 1 < out_height)
+ // Left-over columns
+ const int columns_left = out_width - id.x();
+ for (int x = 0; x < columns_left; ++x)
{
- *(mtx_out + x + 1 * out_stride) = c.val[1][x];
- if(id.y() + 2 < out_height)
+ *(mtx_out + x) = c.val[0][x];
+ if (id.y() + 1 < out_height)
{
- *(mtx_out + x + 2 * out_stride) = c.val[2][x];
- if(id.y() + 3 < out_height)
+ *(mtx_out + x + 1 * out_stride) = c.val[1][x];
+ if (id.y() + 2 < out_height)
{
- *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+ *(mtx_out + x + 2 * out_stride) = c.val[2][x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+ }
}
}
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
-void neon_fp16_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+void neon_fp16_gemm_matrix_mul(const ITensor *lhs,
+ const ITensor *rhs,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info,
+ float alpha,
+ const bool is_dst_vector)
{
- return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
+ return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha)
+ : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
}
-} // namespce cpu
+} // namespace cpu
} // namespace arm_compute
#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
index 9c1f6f3c0f..e12a312280 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
@@ -28,9 +28,16 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+void neon_fp32_gemm_matrix_mul(const ITensor *lhs,
+ const ITensor *rhs,
+ ITensor *dst,
+ const Window &window,
+ const ThreadInfo &info,
+ float alpha,
+ const bool is_dst_vector)
{
- return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
+ return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha)
+ : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
}
-} // namespce cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
index 0051d3d9dc..404d070a37 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h"
+
#include "src/core/utils/helpers/float_ops.h"
#include <arm_neon.h>
@@ -31,10 +32,12 @@ namespace arm_compute
{
namespace cpu
{
-void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void vector_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
- const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
+ const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
+ const auto in_b_stride =
+ static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));
// The implementation computes 16 elements per iteration
@@ -54,7 +57,7 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -69,209 +72,220 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
- execute_window_loop(win_out, [&](const Coordinates &)
- {
- int x = window_start_x;
- // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
- // window_end_x is computed above which may cause out-of-bound writes to the dst.
- for(; x < (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_out,
+ [&](const Coordinates &)
{
- if(x > width_matrix_b)
+ int x = window_start_x;
+            // Here we don't check for x lower than or equal to (window_end_x - window_step_x) because
+            // window_end_x, computed above, may cause out-of-bound writes to the dst.
+ for (; x < (window_end_x - window_step_x); x += window_step_x)
{
- return;
- }
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float32x4_t acc0 = vdupq_n_f32(0.f);
- float32x4_t acc1 = vdupq_n_f32(0.f);
- float32x4_t acc2 = vdupq_n_f32(0.f);
- float32x4_t acc3 = vdupq_n_f32(0.f);
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
- auto vec_a = reinterpret_cast<const float *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4);)
- {
- float32x2_t a0l = vld1_f32(vec_a);
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
- float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
- float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
- float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
- float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */
- acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
- acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
- acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
- acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
- acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
- acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
- acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
- acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
- a0l = vld1_f32(vec_a);
+ a0l = vld1_f32(vec_a);
- b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
- b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
- b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
- b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
- acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
- acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
- acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
- acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
- acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
- acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
- acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
- acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
- }
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float a0 = *vec_a;
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float a0 = *vec_a;
- const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
- const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
- const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
- const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
- acc0 = vmlaq_n_f32(acc0, b00, a0);
- acc1 = vmlaq_n_f32(acc1, b01, a0);
- acc2 = vmlaq_n_f32(acc2, b02, a0);
- acc3 = vmlaq_n_f32(acc3, b03, a0);
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
- matrix_b += in_b_stride;
- }
+ matrix_b += in_b_stride;
+ }
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc0 = vmulq_f32(acc0, alpha_f32);
- acc1 = vmulq_f32(acc1, alpha_f32);
- acc2 = vmulq_f32(acc2, alpha_f32);
- acc3 = vmulq_f32(acc3, alpha_f32);
- }
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc0 = vmulq_f32(acc0, alpha_f32);
+ acc1 = vmulq_f32(acc1, alpha_f32);
+ acc2 = vmulq_f32(acc2, alpha_f32);
+ acc3 = vmulq_f32(acc3, alpha_f32);
+ }
- const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+ const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
- vst1q_f32(vec_out + 0, acc0);
- vst1q_f32(vec_out + 4, acc1);
- vst1q_f32(vec_out + 8, acc2);
- vst1q_f32(vec_out + 12, acc3);
- }
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ }
- // Left-over loop
- for(; x < window_end_x; ++x)
- {
- if(x > width_matrix_b)
+ // Left-over loop
+ for (; x < window_end_x; ++x)
{
- return;
- }
+ if (x > width_matrix_b)
+ {
+ return;
+ }
- float32x4_t vacc = vdupq_n_f32(0.f);
+ float32x4_t vacc = vdupq_n_f32(0.f);
- auto vec_a = reinterpret_cast<const float *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
- {
- const float32x4_t a0l = vld1q_f32(vec_a);
-
- const float32x4_t b_col =
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
{
- *(matrix_b + 0 * in_b_stride),
- *(matrix_b + 1 * in_b_stride),
- *(matrix_b + 2 * in_b_stride),
- *(matrix_b + 3 * in_b_stride),
- };
+ const float32x4_t a0l = vld1q_f32(vec_a);
+
+ const float32x4_t b_col = {
+ *(matrix_b + 0 * in_b_stride),
+ *(matrix_b + 1 * in_b_stride),
+ *(matrix_b + 2 * in_b_stride),
+ *(matrix_b + 3 * in_b_stride),
+ };
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile(
+ "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */
- vacc = vmlaq_f32(vacc, b_col, a0l);
+ vacc = vmlaq_f32(vacc, b_col, a0l);
- matrix_b += 4 * in_b_stride;
- }
+ matrix_b += 4 * in_b_stride;
+ }
- float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3);
+ float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) +
+ vgetq_lane_f32(vacc, 3);
- for(; vec_a < vec_a_end_addr; ++vec_a)
- {
- const float a0 = *vec_a;
+ for (; vec_a < vec_a_end_addr; ++vec_a)
+ {
+ const float a0 = *vec_a;
- const float b00 = *matrix_b;
+ const float b00 = *matrix_b;
- acc += b00 * a0;
+ acc += b00 * a0;
- matrix_b += in_b_stride;
- }
+ matrix_b += in_b_stride;
+ }
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc *= alpha;
- }
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc *= alpha;
+ }
- const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+ const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
- *vec_out = acc;
- }
- },
- ina, inb, out);
+ *vec_out = acc;
+ }
+ },
+ ina, inb, out);
}
-void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void matrix_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
ARM_COMPUTE_UNUSED(info);
- const int out_width = static_cast<int>(dst->info()->dimension(0));
- const int out_height = static_cast<int>(dst->info()->dimension(1));
- const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
- const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
- const size_t out_stride2 = out_stride1 * 2;
- const size_t out_stride3 = out_stride1 * 3;
+ const int out_width = static_cast<int>(dst->info()->dimension(0));
+ const int out_height = static_cast<int>(dst->info()->dimension(1));
+ const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+ const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = rhs->info()->dimension(0);
    // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the input interleaved matrix A has 4 times fewer rows than the dst matrix
@@ -282,7 +296,7 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
Window win_b;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(rhs->info()->num_dimensions() >= 3)
+ if (rhs->info()->num_dimensions() >= 3)
{
win_b = window;
}
@@ -302,338 +316,340 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor
// The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
// The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
// All the values needed for computing a single 4x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
- auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
- auto mtx_b1 = mtx_b0 + in_b_stride;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
- float32x4_t acc00 = vdupq_n_f32(0.f);
- float32x4_t acc10 = vdupq_n_f32(0.f);
- float32x4_t acc20 = vdupq_n_f32(0.f);
- float32x4_t acc30 = vdupq_n_f32(0.f);
+ float32x4_t acc00 = vdupq_n_f32(0.f);
+ float32x4_t acc10 = vdupq_n_f32(0.f);
+ float32x4_t acc20 = vdupq_n_f32(0.f);
+ float32x4_t acc30 = vdupq_n_f32(0.f);
- float32x4_t acc01 = vdupq_n_f32(0.f);
- float32x4_t acc11 = vdupq_n_f32(0.f);
- float32x4_t acc21 = vdupq_n_f32(0.f);
- float32x4_t acc31 = vdupq_n_f32(0.f);
+ float32x4_t acc01 = vdupq_n_f32(0.f);
+ float32x4_t acc11 = vdupq_n_f32(0.f);
+ float32x4_t acc21 = vdupq_n_f32(0.f);
+ float32x4_t acc31 = vdupq_n_f32(0.f);
#if __arm__
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
- for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
- {
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+ for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
- float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
- float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
- float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
- float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
- float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
-
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
#if __arm__
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
-
- a0 = vld1q_dup_f32(mtx_a0 + 0);
- a1 = vld1q_dup_f32(mtx_a0 + 1);
- a2 = vld1q_dup_f32(mtx_a0 + 2);
- a3 = vld1q_dup_f32(mtx_a0 + 3);
- b00 = vld1q_f32(mtx_b0);
- b10 = vld1q_f32(mtx_b1);
- b01 = vld1q_f32(mtx_b0 + 4);
- b11 = vld1q_f32(mtx_b1 + 4);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- a4 = vld1q_dup_f32(mtx_a0 + 4);
- a5 = vld1q_dup_f32(mtx_a0 + 5);
- a6 = vld1q_dup_f32(mtx_a0 + 6);
- a7 = vld1q_dup_f32(mtx_a0 + 7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b01, a4);
- acc10 = vmlaq_f32(acc10, b01, a5);
- acc20 = vmlaq_f32(acc20, b01, a6);
- acc30 = vmlaq_f32(acc30, b01, a7);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b11, a4);
- acc11 = vmlaq_f32(acc11, b11, a5);
- acc21 = vmlaq_f32(acc21, b11, a6);
- acc31 = vmlaq_f32(acc31, b11, a7);
-
- mtx_a0 += 8;
- mtx_b0 += 8;
- mtx_b1 += 8;
- }
-
- for(; mtx_b0 < mtx_b0_end_addr;)
- {
- float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
- float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
- float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
- float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
- float32x4_t b00 = vld1q_f32(mtx_b0);
- float32x4_t b10 = vld1q_f32(mtx_b1);
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ for (; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
#if __arm__
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
#endif /* __arm__ */
- // 4x4 block 0
- acc00 = vmlaq_f32(acc00, b00, a0);
- acc10 = vmlaq_f32(acc10, b00, a1);
- acc20 = vmlaq_f32(acc20, b00, a2);
- acc30 = vmlaq_f32(acc30, b00, a3);
-
- // 4x4 block 1
- acc01 = vmlaq_f32(acc01, b10, a0);
- acc11 = vmlaq_f32(acc11, b10, a1);
- acc21 = vmlaq_f32(acc21, b10, a2);
- acc31 = vmlaq_f32(acc31, b10, a3);
-
- mtx_a0 += 4;
- mtx_b0 += 4;
- mtx_b1 += 4;
- }
-
- // Multiply by the weight of matrix product (alpha)
- if(multiply_alpha)
- {
- acc00 = vmulq_f32(acc00, alpha_f32);
- acc10 = vmulq_f32(acc10, alpha_f32);
- acc20 = vmulq_f32(acc20, alpha_f32);
- acc30 = vmulq_f32(acc30, alpha_f32);
- acc01 = vmulq_f32(acc01, alpha_f32);
- acc11 = vmulq_f32(acc11, alpha_f32);
- acc21 = vmulq_f32(acc21, alpha_f32);
- acc31 = vmulq_f32(acc31, alpha_f32);
- }
-
- const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
- const auto mtx_out1 = mtx_out0 + 4;
-
- if(id.x() < (out_width - 8))
- {
- vst1q_f32(mtx_out0, acc00);
- vst1q_f32(mtx_out1, acc01);
- if(id.y() + 1 < out_height)
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ mtx_b1 += 4;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if (multiply_alpha)
+ {
+ acc00 = vmulq_f32(acc00, alpha_f32);
+ acc10 = vmulq_f32(acc10, alpha_f32);
+ acc20 = vmulq_f32(acc20, alpha_f32);
+ acc30 = vmulq_f32(acc30, alpha_f32);
+ acc01 = vmulq_f32(acc01, alpha_f32);
+ acc11 = vmulq_f32(acc11, alpha_f32);
+ acc21 = vmulq_f32(acc21, alpha_f32);
+ acc31 = vmulq_f32(acc31, alpha_f32);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
+ const auto mtx_out1 = mtx_out0 + 4;
+
+ if (id.x() < (out_width - 8))
{
- vst1q_f32(mtx_out0 + out_stride1, acc10);
- vst1q_f32(mtx_out1 + out_stride1, acc11);
- if(id.y() + 2 < out_height)
+ vst1q_f32(mtx_out0, acc00);
+ vst1q_f32(mtx_out1, acc01);
+ if (id.y() + 1 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride2, acc20);
- vst1q_f32(mtx_out1 + out_stride2, acc21);
- if(id.y() + 3 < out_height)
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ vst1q_f32(mtx_out1 + out_stride1, acc11);
+ if (id.y() + 2 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride3, acc30);
- vst1q_f32(mtx_out1 + out_stride3, acc31);
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ vst1q_f32(mtx_out1 + out_stride2, acc21);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out1 + out_stride3, acc31);
+ }
}
}
}
- }
- else if(id.x() < (out_width - 4))
- {
- vst1q_f32(mtx_out0, acc00);
- if(id.y() + 1 < out_height)
+ else if (id.x() < (out_width - 4))
{
- vst1q_f32(mtx_out0 + out_stride1, acc10);
- if(id.y() + 2 < out_height)
+ vst1q_f32(mtx_out0, acc00);
+ if (id.y() + 1 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride2, acc20);
- if(id.y() + 3 < out_height)
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ if (id.y() + 2 < out_height)
{
- vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ if (id.y() + 3 < out_height)
+ {
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ }
}
}
- }
- // Left-over columns
- const int columns_left = out_width - id.x() - 4;
- for(auto x = 0; x < columns_left; ++x)
- {
- *(mtx_out1 + x) = acc01[x];
- if(id.y() + 1 < out_height)
+ // Left-over columns
+ const int columns_left = out_width - id.x() - 4;
+ for (auto x = 0; x < columns_left; ++x)
{
- *(mtx_out1 + x + out_stride1) = acc11[x];
- if(id.y() + 2 < out_height)
+ *(mtx_out1 + x) = acc01[x];
+ if (id.y() + 1 < out_height)
{
- *(mtx_out1 + x + out_stride2) = acc21[x];
- if(id.y() + 3 < out_height)
+ *(mtx_out1 + x + out_stride1) = acc11[x];
+ if (id.y() + 2 < out_height)
{
- *(mtx_out1 + x + out_stride3) = acc31[x];
+ *(mtx_out1 + x + out_stride2) = acc21[x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out1 + x + out_stride3) = acc31[x];
+ }
}
}
}
}
- }
- else
- {
- // Left-over columns
- const int columns_left = out_width - id.x();
- for(int x = 0; x < columns_left; ++x)
+ else
{
- *(mtx_out0 + x) = acc00[x];
- if(id.y() + 1 < out_height)
+ // Left-over columns
+ const int columns_left = out_width - id.x();
+ for (int x = 0; x < columns_left; ++x)
{
- *(mtx_out0 + x + out_stride1) = acc10[x];
- if(id.y() + 2 < out_height)
+ *(mtx_out0 + x) = acc00[x];
+ if (id.y() + 1 < out_height)
{
- *(mtx_out0 + x + out_stride2) = acc20[x];
- if(id.y() + 3 < out_height)
+ *(mtx_out0 + x + out_stride1) = acc10[x];
+ if (id.y() + 2 < out_height)
{
- *(mtx_out0 + x + out_stride3) = acc30[x];
+ *(mtx_out0 + x + out_stride2) = acc20[x];
+ if (id.y() + 3 < out_height)
+ {
+ *(mtx_out0 + x + out_stride3) = acc30[x];
+ }
}
}
}
}
- }
- },
- ina, inb, out);
+ },
+ ina, inb, out);
}
} // namespace cpu
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
index f9f1f247ac..74ea4c2b17 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
@@ -24,15 +24,18 @@
#ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
#define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/CPP/Validate.h"
namespace arm_compute
{
namespace cpu
{
-void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+void vector_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
-void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+void matrix_matrix_multiply_f32(
+ const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h
index 9cdb58ae06..15b23b1d81 100644
--- a/src/cpu/kernels/gemm_matrix_mul/list.h
+++ b/src/cpu/kernels/gemm_matrix_mul/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \
- void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \
+ void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \
+ float alpha, const bool is_dst_vector)
DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul);
DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul);
#undef DECLARE_GEMMMATRIXMUL_KERNEL
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
index d4e469b691..4ed7e54f1c 100644
--- a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
@@ -27,10 +27,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_fp16_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
index 09aa6ecec4..f15cd63bb2 100644
--- a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
@@ -26,9 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_fp32_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
index 9224e32a94..8cb76f3afb 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
@@ -28,7 +28,10 @@ class ITensor;
class Window;
namespace cpu
{
-void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void compute_all_anchors_qasymm16(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
Iterator all_anchors_it(all_anchors, window);
Iterator anchors_it(all_anchors, window);
@@ -39,28 +42,30 @@ void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors,
const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
- const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+ const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
- const size_t shift_idy = id.y() / num_anchors;
- const float shiftx = (shift_idy % feat_width) * stride;
- const float shifty = (shift_idy / feat_width) * stride;
+ const size_t shift_idy = id.y() / num_anchors;
+ const float shiftx = (shift_idy % feat_width) * stride;
+ const float shifty = (shift_idy / feat_width) * stride;
- const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
- const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
- const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
- const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
+ const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
+ const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
+ const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
+ const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
- *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
- *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
- *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
- *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
- },
- all_anchors_it);
+ *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
+ *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
+ *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
+ *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
+ },
+ all_anchors_it);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h
index da052c9192..3317bcfbe6 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.h
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.h
@@ -26,13 +26,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void compute_all_anchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
Iterator all_anchors_it(all_anchors, window);
Iterator anchors_it(all_anchors, window);
@@ -41,26 +45,31 @@ void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAn
const T stride = 1.f / anchors_info.spatial_scale();
const size_t feat_width = anchors_info.feat_width();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const size_t anchor_offset = id.y() % num_anchors;
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const size_t anchor_offset = id.y() % num_anchors;
- const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
- const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+ const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
- const size_t shift_idy = id.y() / num_anchors;
- const T shiftx = (shift_idy % feat_width) * stride;
- const T shifty = (shift_idy / feat_width) * stride;
+ const size_t shift_idy = id.y() / num_anchors;
+ const T shiftx = (shift_idy % feat_width) * stride;
+ const T shifty = (shift_idy / feat_width) * stride;
- *out_anchor_ptr = *anchor_ptr + shiftx;
- *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
- *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
- *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
- },
- all_anchors_it);
+ *out_anchor_ptr = *anchor_ptr + shiftx;
+ *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
+ *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
+ *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
+ },
+ all_anchors_it);
}
-void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+void compute_all_anchors_qasymm16(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
index cfb5a41d6e..7182d0b27d 100644
--- a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
@@ -26,9 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_qu16_computeallanchors(const ITensor *anchors,
+ ITensor *all_anchors,
+ ComputeAnchorsInfo anchors_info,
+ const Window &window)
{
return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
index 2b7d91b144..44418c0bb9 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
@@ -40,7 +41,10 @@ void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputT
}
template <typename InputType, typename AccType>
-InputType vector_float_norm_fp16(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+InputType vector_float_norm_fp16(const InputType &inputs,
+ const AccType &vec_mean,
+ const AccType &vec_multip,
+ const AccType &vec_beta)
{
return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
}
@@ -52,19 +56,24 @@ inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_squar
vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
}
template <>
-inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs,
+ const float32x4_t &vec_mean,
+ const float32x4_t &vec_multip,
+ const float32x4_t &vec_beta)
{
- const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
- const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
- const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
- const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
- float16x8_t result = wrapper::vcombine(result_low, result_high);
+ const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
+ const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
+ const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
+ const auto result_high =
+ wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
+ float16x8_t result = wrapper::vcombine(result_low, result_high);
return result;
}
template <typename AccType>
-void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+void instance_normalization_nchw_fp16(
+ const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -78,91 +87,105 @@ void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, flo
const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
- }
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec =
+ vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
},
- input_plane_it, output_plane_it);
- },
- input_it);
-}
+ input_it);
}
-
-void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+} // namespace
+
+void neon_fp16_instancenorm(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)
{
- if(use_mixed_precision)
+ if (use_mixed_precision)
{
return instance_normalization_nchw_fp16<float>(input, output, gamma, beta, epsilon, window);
}
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
index 061dd9585c..e1ca05518d 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
@@ -26,7 +26,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+void neon_fp32_instancenorm(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(use_mixed_precision);
return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window);
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
index 483b6f568b..515079e1b5 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -38,13 +39,15 @@ void vector_float_sum(AccType &result, AccType &result_square, const InputType &
}
template <typename InputType, typename AccType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
{
return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
}
template <typename T, typename AccType>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+void instance_normalization_nchw(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -58,88 +61,96 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, f
const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- Window win_plane = window;
- win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
- win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
- win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
- Iterator input_plane_it(input, win_plane);
- Iterator output_plane_it(output, win_plane);
-
- auto sum_h_w = static_cast<AccType>(0.f);
- auto sum_squares_h_w = static_cast<AccType>(0.f);
-
- execute_window_loop(win_plane, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
-
- auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
- auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-
- // Compute S elements per iteration
- int x = window.x().start();
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- auto vec_input_val = wrapper::vloadq(input_ptr + x);
- vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
- }
-
- auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
- auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-
- vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
- vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-
- sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
- sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto value = static_cast<AccType>(*(input_ptr + x));
- sum_h_w += value;
- sum_squares_h_w += value * value;
- }
- },
- input_plane_it, output_plane_it);
-
- const auto mean_h_w = sum_h_w / elements_plane;
- const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
- const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
- const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
- const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
- const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
- execute_window_loop(win_plane, [&](const Coordinates &)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
- auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
- // Compute S elements per iteration
- int x = window.x().start();
- //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
- for(; x <= (window.x().end() - window_step_x); x += window_step_x)
- {
- const auto vec_val = wrapper::vloadq(input_ptr + x);
- const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
- wrapper::vstore(output_ptr + x, normalized_vec);
- }
-
- // Compute left-over elements
- for(; x < window.x().end(); ++x)
- {
- const auto val = static_cast<AccType>(*(input_ptr + x));
- *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
- }
+ Window win_plane = window;
+ win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+ win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+ Iterator input_plane_it(input, win_plane);
+ Iterator output_plane_it(output, win_plane);
+
+ auto sum_h_w = static_cast<AccType>(0.f);
+ auto sum_squares_h_w = static_cast<AccType>(0.f);
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+ auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+ auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ auto vec_input_val = wrapper::vloadq(input_ptr + x);
+ vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+ }
+
+ auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+ auto vec2_sum_squares_h_w =
+ wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+ vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+ vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+ sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+ sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto value = static_cast<AccType>(*(input_ptr + x));
+ sum_h_w += value;
+ sum_squares_h_w += value * value;
+ }
+ },
+ input_plane_it, output_plane_it);
+
+ const auto mean_h_w = sum_h_w / elements_plane;
+ const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+ const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon);
+ const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+ const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+ const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+ execute_window_loop(
+ win_plane,
+ [&](const Coordinates &)
+ {
+ auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+ // Compute S elements per iteration
+ int x = window.x().start();
+ //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+ for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+ {
+ const auto vec_val = wrapper::vloadq(input_ptr + x);
+ const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+ wrapper::vstore(output_ptr + x, normalized_vec);
+ }
+
+ // Compute left-over elements
+ for (; x < window.x().end(); ++x)
+ {
+ const auto val = static_cast<AccType>(*(input_ptr + x));
+ *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
+ }
+ },
+ input_plane_it, output_plane_it);
},
- input_plane_it, output_plane_it);
- },
- input_it);
+ input_it);
}
-template void instance_normalization_nchw<float>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+template void instance_normalization_nchw<float>(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h
index 0ddfcdd5ba..e1cc7487f7 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.h
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h
@@ -32,13 +32,15 @@ namespace arm_compute
namespace cpu
{
template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+void instance_normalization_nchw(
+ ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
template <typename InputType, typename AccType = InputType>
void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs);
template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
} // namespace cpu
} // namespace arm_compute
#endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h
index 54f1d3213f..51b496c41d 100644
--- a/src/cpu/kernels/instancenorm/list.h
+++ b/src/cpu/kernels/instancenorm/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_INSTANCENORM_KERNEL(func_name) \
- void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+#define DECLARE_INSTANCENORM_KERNEL(func_name) \
+ void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \
+ const Window &window)
DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm);
DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm);
#undef DECLARE_INSTANCENORM_KERNEL
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
index b503a8b734..32d9ca4eac 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -24,18 +24,17 @@
#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/AssemblyUtils.h"
-
#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "src/core/utils/AssemblyUtils.h"
#include "depthwise_common.hpp"
-
#include <arm_neon.h>
namespace arm_compute
@@ -54,9 +53,13 @@ constexpr unsigned int idx_channels = 0;
constexpr unsigned int idx_batches = 3;
template <typename TSrc, typename TWeights, typename TDst>
-void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
- const ConvolutionInfo &info, const CPUInfo &cpu_info,
- std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name)
+void create_arm_dwc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info,
+ std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
+ std::string &_name)
{
unsigned int stride_cols{};
unsigned int stride_rows{};
@@ -79,13 +82,13 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI
const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
- arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols,
- n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
- padding, activation, nullptr);
+ arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+ dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+ dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
// Configure assembly pooling kernel
auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
- if(dwc_kernel_asm == nullptr)
+ if (dwc_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
@@ -96,11 +99,16 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI
}
template <typename TSrc, typename TWeights, typename TDst>
-void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
- const ConvolutionInfo &info, const CPUInfo &cpu_info,
+void create_arm_dwc_quant(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info,
std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
- std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts,
- std::string &_name)
+ std::vector<int32_t> &multipliers,
+ std::vector<int32_t> &right_shifts,
+ std::vector<int32_t> &left_shifts,
+ std::string &_name)
{
unsigned int stride_cols{};
unsigned int stride_rows{};
@@ -123,9 +131,9 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
- arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols,
- n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
- padding, activation, nullptr);
+ arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+ dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+ dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
const auto src_qinfo = src->quantization_info().uniform();
const auto weights_qinfo = weights->quantization_info();
@@ -135,64 +143,50 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
multipliers.resize(num_filters);
std::vector<int32_t> dst_shifts(num_filters);
- quantization::compute_quantized_multipliers_and_shifts(src,
- weights,
- dst,
- multipliers.data(),
- dst_shifts.data());
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data());
// Quantize activation bounds
int32_t min_activation = std::numeric_limits<TSrc>::lowest();
int32_t max_activation = std::numeric_limits<TSrc>::max();
- if(info.act_info.enabled())
+ if (info.act_info.enabled())
{
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
}
// Set quantization parameters for assembly kernels
arm_gemm::Requantize32 requant_args{};
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
left_shifts.resize(num_filters);
right_shifts.resize(num_filters);
bool need_left_shift = false; // Select more optimized path if left shift is not needed
- for(unsigned int i = 0; i < num_filters; ++i)
+ for (unsigned int i = 0; i < num_filters; ++i)
{
left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
- if(dst_shifts[i] < 0 && !need_left_shift)
+ if (dst_shifts[i] < 0 && !need_left_shift)
{
need_left_shift = true;
}
}
- requant_args = arm_gemm::Requantize32(nullptr,
- 0,
- src_qinfo.offset,
- weights_qinfo.uniform().offset,
- dst_qinfo.offset,
- (need_left_shift) ? left_shifts.data() : nullptr,
- right_shifts.data(),
- multipliers.data(),
- static_cast<TSrc>(min_activation),
- static_cast<TSrc>(max_activation));
+ requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+ dst_qinfo.offset, (need_left_shift) ? left_shifts.data() : nullptr,
+ right_shifts.data(), multipliers.data(),
+ static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
}
else
{
- requant_args = arm_gemm::Requantize32(nullptr,
- 0,
- src_qinfo.offset,
- weights_qinfo.uniform().offset,
- dst_qinfo.offset,
- -dst_shifts[0],
- multipliers[0],
- static_cast<TSrc>(min_activation),
- static_cast<TSrc>(max_activation));
+ requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+ dst_qinfo.offset, -dst_shifts[0], multipliers[0],
+ static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
}
// Configure assembly pooling kernel with requantization
- auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
- if(dwc_kernel_asm == nullptr)
+ auto dwc_kernel_asm =
+ arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
+ if (dwc_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
@@ -203,18 +197,18 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT
} // namespace
CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
- : _kernel_asm(nullptr),
- _multipliers(),
- _left_shifts(),
- _right_shifts(),
- _name()
+ : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name()
{
}
CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
-void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst,
- const ConvolutionInfo &info, const CPUInfo &cpu_info)
+void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info)
{
ARM_COMPUTE_UNUSED(cpu_info);
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -225,24 +219,30 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
_name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
std::string asm_kernel_name("");
#if defined(__aarch64__)
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
- create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+ create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ _multipliers, _right_shifts, _left_shifts,
+ asm_kernel_name);
}
else
{
- create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+ create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ _multipliers, _right_shifts, _left_shifts,
+ asm_kernel_name);
}
break;
case DataType::QASYMM8_SIGNED:
- create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+ create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers,
+ _right_shifts, _left_shifts, asm_kernel_name);
break;
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
case DataType::F16:
- create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
+ create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+ asm_kernel_name);
break;
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
case DataType::F32:
@@ -255,13 +255,17 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
- if(_kernel_asm != nullptr)
+ if (_kernel_asm != nullptr)
{
_name += "/" + asm_kernel_name;
}
}
-Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -269,10 +273,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
#endif // !defined(__aarch64__)
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC,
+ "Only NHWC is supported by assembly kernels");
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
@@ -282,12 +288,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -297,7 +303,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
}
}
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -305,17 +311,15 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src,
}
// Assembly kernels cannot work with padding greater than the kernel.
- const auto &padding = info.pad_stride_info;
- const auto &dilation = info.dilation;
+ const auto &padding = info.pad_stride_info;
+ const auto &dilation = info.dilation;
const auto &wei_shape = weights->tensor_shape();
const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1);
const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1);
- ARM_COMPUTE_RETURN_ERROR_ON(
- padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
- padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h
- );
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
+ padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h);
return Status{};
}
@@ -351,13 +355,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const
const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
- _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
- parameters_ptr,
- dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
- working_space, info.thread_id, info.num_threads);
+ _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row,
+ ld_dst_batch, working_space, info.thread_id, info.num_threads);
}
-void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
+void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(
+ void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
{
_kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
}
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
index f61cb1b09c..fadaefb999 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
@@ -35,8 +36,8 @@ namespace depthwise
{
// Forward declarations
class IDepthwiseCommon;
-} // depthwise
-} // arm_conv
+} // namespace depthwise
+} // namespace arm_conv
namespace arm_compute
{
@@ -66,7 +67,12 @@ public:
* @param[in] info Depthwise convolution layer meta-data.
* @param[in] cpu_info CPU information needed to select the most appropriate kernel.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info,
+ const CPUInfo &cpu_info);
/** Indicates whether or not this function can be used to process the given parameters.
*
@@ -74,10 +80,14 @@ public:
*
* @return a status.
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
/** Pack bias and weights in a storage space for the assembly kernel
@@ -88,7 +98,8 @@ public:
* @param[in] ld_weights_col Columns displacement for the weights tensor.
* @param[in] ld_weights_row Rows displacement for the weights tensor.
*/
- void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
+ void pack_parameters(
+ void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
/** Get the amount of storage space required for the rearranged weights and bias.
*
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index 10ff4183c0..a161c800fd 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -22,14 +22,16 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
#include <arm_neon.h>
@@ -41,7 +43,10 @@ namespace kernels
{
using namespace arm_compute::misc::shape_calculator;
-void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
{
ARM_COMPUTE_UNUSED(cpu_info);
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -52,10 +57,10 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
#if defined(__aarch64__)
const bool requantize = src->quantization_info() != dst->quantization_info();
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::QASYMM8:
- if(requantize)
+ if (requantize)
{
create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
}
@@ -65,7 +70,7 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
}
break;
case DataType::QASYMM8_SIGNED:
- if(requantize)
+ if (requantize)
{
create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
}
@@ -91,7 +96,8 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn
INEKernel::configure(win);
}
-Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+Status
+CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -99,43 +105,52 @@ Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const IT
ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
#endif /* __aarch64__ */
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC),
+ "Only NHWC is supported by assembly kernels");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
"Only AVG and MAX pooling are supported by assembly kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_pool_region_entirely_outside_input(info),
+ "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
const auto src_qinfo = src->quantization_info().uniform();
const auto dst_qinfo = dst->quantization_info().uniform();
- if(src_qinfo != dst_qinfo)
+ if (src_qinfo != dst_qinfo)
{
const float multiplier = src_qinfo.scale / dst_qinfo.scale;
int32_t dst_multiplier{};
int32_t dst_shift{};
- ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
}
else
{
- if(src->data_type() == DataType::QASYMM8)
+ if (src->data_type() == DataType::QASYMM8)
{
const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !info.exclude_padding && has_padding,
+ "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
}
}
}
else
{
- if(src->data_type() == DataType::QASYMM8)
+ if (src->data_type() == DataType::QASYMM8)
{
// If dst is not configured, the quantization info are the same
const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !info.exclude_padding && has_padding,
+ "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
}
}
return Status{};
@@ -154,9 +169,10 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
- const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
- auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
- auto working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+ const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
+ auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+ auto working_space =
+ (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
const auto src_shape = src->info()->tensor_shape();
const auto dst_shape = dst->info()->tensor_shape();
@@ -170,8 +186,7 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &
const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
- _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
- out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+ _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
working_space, info.thread_id, info.num_threads);
}
@@ -186,9 +201,14 @@ bool CpuPool2dAssemblyWrapperKernel::is_configured() const
}
template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+ ? arm_conv::pooling::PoolingType::AVERAGE
+ : arm_conv::pooling::PoolingType::MAX;
arm_conv::pooling::PoolingWindow window{};
window.cols = static_cast<unsigned int>(info.pool_size.x());
@@ -197,7 +217,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
arm_conv::pooling::PoolingStride stride{};
std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+ const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+ info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
constexpr unsigned int idx_width = 1;
constexpr unsigned int idx_height = 2;
@@ -211,11 +232,12 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
const unsigned int dst_rows = dst->dimension(idx_height);
const unsigned int dst_cols = dst->dimension(idx_width);
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+ src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
// Configure assembly pooling kernel
auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
- if(pooling_kernel_asm == nullptr)
+ if (pooling_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
@@ -225,9 +247,14 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
}
template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info)
{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+ const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+ ? arm_conv::pooling::PoolingType::AVERAGE
+ : arm_conv::pooling::PoolingType::MAX;
arm_conv::pooling::PoolingWindow window{};
window.cols = static_cast<unsigned int>(info.pool_size.x());
@@ -236,7 +263,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
arm_conv::pooling::PoolingStride stride{};
std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+ const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+ info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
constexpr unsigned int idx_width = 1;
constexpr unsigned int idx_height = 2;
@@ -250,7 +278,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
const unsigned int dst_rows = dst->dimension(idx_height);
const unsigned int dst_cols = dst->dimension(idx_width);
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+ arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+ src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
const auto src_qinfo = src->quantization_info().uniform();
const auto dst_qinfo = dst->quantization_info().uniform();
@@ -260,15 +289,15 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf
int32_t dst_shift{};
quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
- const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
- dst_qinfo.offset,
+ const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset,
dst_shift, // left shift
0, // right shift
dst_multiplier);
// Configure assembly pooling kernel with requantization
- auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
- if(pooling_kernel_asm == nullptr)
+ auto pooling_kernel_asm =
+ arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
+ if (pooling_kernel_asm == nullptr)
{
// Configuration not supported: Leave function unconfigured:
return;
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 8713d5c54d..b4ff1e6f2d 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -25,8 +25,9 @@
#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
#include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+
#include "src/core/common/Macros.h"
+#include "src/core/NEON/kernels/assembly/pooling.hpp"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/kernels/CpuKernelSelectionTypes.h"
@@ -101,7 +102,8 @@ private:
* @param[in] info Pooling layer meta-data.
*/
template <typename Typesrc, typename Typedst>
- void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ void
+ create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
/** Helper function to create the assembly kernel with requantization support
*
@@ -110,9 +112,12 @@ private:
* @param[in] info Pooling layer meta-data.
*/
template <typename Typesrc, typename Typedst>
- void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+ void create_arm_pooling_requant(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const CPUInfo &cpu_info);
- std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
+ std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{nullptr};
/** Return minimum workload size of the relevant kernel
*
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
index 661c3d7f46..6c6527de06 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
@@ -32,13 +32,15 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp16_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
{
ARM_COMPUTE_UNUSED(unused_axis);
return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window);
}
-void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp16_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis);
}
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
index be32bdc4fa..520877068c 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
@@ -22,21 +22,23 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
+
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp32_l2_normalize_x(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
{
ARM_COMPUTE_UNUSED(unused_axis);
return l2_normalize_x<float, 4>(in, sum, out, epsilon, window);
}
-void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp32_l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis);
}
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
index a06cdd33d3..6bd19299b7 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h
+++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <cstddef>
@@ -51,33 +52,36 @@ void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float e
Iterator sum_it(sum, win_collapsed);
Iterator output_it(out, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
- const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr());
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+ const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
}
template <typename T, int S>
-void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void l2_normalize_yz(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
{
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
@@ -97,28 +101,30 @@ void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float
const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
- const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
- // Compute elements over vector steps
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
- wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
- out_ptr[x] = in_ptr[x] * norm_value;
- }
- },
- input_it, sum_it, output_it);
+ const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr());
+ const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+ // Compute elements over vector steps
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+ wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+ out_ptr[x] = in_ptr[x] * norm_value;
+ }
+ },
+ input_it, sum_it, output_it);
}
} // namespace cpu
} // namespace arm_compute
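The reformatted l2_normalize_x/l2_normalize_yz templates above scale each element by the inverse L2 norm of its row, with the sum of squares supplied by a preceding reduction and clamped by epsilon. A minimal stand-alone sketch of that arithmetic follows; the helper name l2_normalize_row, the raw-pointer interface and the scalar loop are illustrative only, since the library kernel works through Iterator/execute_window_loop and vectorizes the multiply:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Reference-only sketch: L2-normalize one row along X, given the row's
    // precomputed sum of squares, mirroring the arithmetic in l2_normalize_x.
    void l2_normalize_row(const float *in, float sum_sq, float *out, std::size_t len, float epsilon)
    {
        // Clamp with epsilon, as the kernel does, to avoid dividing by ~zero.
        const float norm = 1.0f / std::sqrt(std::max(sum_sq, epsilon));
        for (std::size_t x = 0; x < len; ++x)
        {
            out[x] = in[x] * norm;
        }
    }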
diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h
index 2bad7f54f5..e2a879d06e 100644
--- a/src/cpu/kernels/l2normlayer/list.h
+++ b/src/cpu/kernels/l2normlayer/list.h
@@ -27,8 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \
+ size_t axis)
DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x);
DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz);
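For reference, the DECLARE_L2NORMLAYER_KERNEL macro above stamps out one declaration per kernel with this uniform signature, so the fp16 line expands to a declaration equivalent to:

    void neon_fp16_l2_normalize_x(
        const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis);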
diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp
index 8ab647bfee..5516f5b33d 100644
--- a/src/cpu/kernels/lut/generic/neon/u8.cpp
+++ b/src/cpu/kernels/lut/generic/neon/u8.cpp
@@ -32,376 +32,374 @@ namespace cpu
#ifdef __aarch64__
void lut_u8_neon(
- const uint8_t *table,
- size_t num_strings,
- size_t string_length,
- const uint8_t *const *input,
- uint8_t *const *output)
+ const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
{
- __asm__ __volatile__(
- "ldr q16, [%x[table], #0x0]\n"
- "ldr q17, [%x[table], #0x10]\n"
- "mov x23, #0x0\n"
- "ldr q18, [%x[table], #0x20]\n"
- "ldr q19, [%x[table], #0x30]\n"
- "ldr q20, [%x[table], #0x40]\n"
- "ldr q21, [%x[table], #0x50]\n"
- "ldr q22, [%x[table], #0x60]\n"
- "ldr q23, [%x[table], #0x70]\n"
- "ldr q24, [%x[table], #0x80]\n"
- "ldr q25, [%x[table], #0x90]\n"
- "ldr q26, [%x[table], #0xa0]\n"
- "ldr q27, [%x[table], #0xb0]\n"
- "ldr q28, [%x[table], #0xc0]\n"
- "ldr q29, [%x[table], #0xd0]\n"
- "ldr q30, [%x[table], #0xe0]\n"
- "ldr q31, [%x[table], #0xf0]\n"
- "1:" // string loop
- "ldr x22, [%x[input], x23, LSL #0x3]\n"
- "ldr x21, [%x[output], x23, LSL #0x3]\n"
- "movi v11.16b, #0x40\n"
- "movi v10.16b, #0x80\n"
- "movi v9.16b, #0xc0\n"
- "mov x20, %x[string_length]\n"
- "2:" // 4 rounds: width loop
- "cmp x20, #0x30\n"
- "bge 27f\n"
- "tbz x20, #5, 10f\n"
- "ld1 { v8.16b }, [x22], #0x10\n"
- "ld1 { v13.16b }, [x22], #0x10\n"
- "tbz x20, #3, 6f\n"
- "ldr d12, [x22], #0x8\n"
- "tbz x20, #2, 4f\n"
- "ld1 { v12.s }[2], [x22], #0x4\n"
- "tbz x20, #1, 3f\n"
- "ld1 { v12.h }[6], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[14], [x22]\n"
- "b 26f\n"
- "3:" // 4 rounds: Partial load: partial_1_44
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[12], [x22]\n"
- "b 26f\n"
- "4:" // 4 rounds: Partial load: partial_2_40
- "tbz x20, #1, 5f\n"
- "ld1 { v12.h }[4], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[10], [x22]\n"
- "b 26f\n"
- "5:" // 4 rounds: Partial load: partial_1_40
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[8], [x22]\n"
- "b 26f\n"
- "6:" // 4 rounds: Partial load: partial_4_32
- "tbz x20, #2, 8f\n"
- "ldr s12, [x22], #0x4\n"
- "tbz x20, #1, 7f\n"
- "ld1 { v12.h }[2], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[6], [x22]\n"
- "b 26f\n"
- "7:" // 4 rounds: Partial load: partial_1_36
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[4], [x22]\n"
- "b 26f\n"
- "8:" // 4 rounds: Partial load: partial_2_32
- "tbz x20, #1, 9f\n"
- "ldr h12, [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v12.b }[2], [x22]\n"
- "b 26f\n"
- "9:" // 4 rounds: Partial load: partial_1_32
- "tbz x20, #0, 26f\n"
- "ldr b12, [x22, #0x0]\n"
- "b 26f\n"
- "10:" // 4 rounds: Partial load: partial_16_0
- "tbz x20, #4, 18f\n"
- "ld1 { v8.16b }, [x22], #0x10\n"
- "tbz x20, #3, 14f\n"
- "ldr d13, [x22], #0x8\n"
- "tbz x20, #2, 12f\n"
- "ld1 { v13.s }[2], [x22], #0x4\n"
- "tbz x20, #1, 11f\n"
- "ld1 { v13.h }[6], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[14], [x22]\n"
- "b 26f\n"
- "11:" // 4 rounds: Partial load: partial_1_28
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[12], [x22]\n"
- "b 26f\n"
- "12:" // 4 rounds: Partial load: partial_2_24
- "tbz x20, #1, 13f\n"
- "ld1 { v13.h }[4], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[10], [x22]\n"
- "b 26f\n"
- "13:" // 4 rounds: Partial load: partial_1_24
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[8], [x22]\n"
- "b 26f\n"
- "14:" // 4 rounds: Partial load: partial_4_16
- "tbz x20, #2, 16f\n"
- "ldr s13, [x22], #0x4\n"
- "tbz x20, #1, 15f\n"
- "ld1 { v13.h }[2], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[6], [x22]\n"
- "b 26f\n"
- "15:" // 4 rounds: Partial load: partial_1_20
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[4], [x22]\n"
- "b 26f\n"
- "16:" // 4 rounds: Partial load: partial_2_16
- "tbz x20, #1, 17f\n"
- "ldr h13, [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v13.b }[2], [x22]\n"
- "b 26f\n"
- "17:" // 4 rounds: Partial load: partial_1_16
- "tbz x20, #0, 26f\n"
- "ldr b13, [x22, #0x0]\n"
- "b 26f\n"
- "18:" // 4 rounds: Partial load: partial_8_0
- "tbz x20, #3, 22f\n"
- "ldr d8, [x22], #0x8\n"
- "tbz x20, #2, 20f\n"
- "ld1 { v8.s }[2], [x22], #0x4\n"
- "tbz x20, #1, 19f\n"
- "ld1 { v8.h }[6], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[14], [x22]\n"
- "b 26f\n"
- "19:" // 4 rounds: Partial load: partial_1_12
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[12], [x22]\n"
- "b 26f\n"
- "20:" // 4 rounds: Partial load: partial_2_8
- "tbz x20, #1, 21f\n"
- "ld1 { v8.h }[4], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[10], [x22]\n"
- "b 26f\n"
- "21:" // 4 rounds: Partial load: partial_1_8
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[8], [x22]\n"
- "b 26f\n"
- "22:" // 4 rounds: Partial load: partial_4_0
- "tbz x20, #2, 24f\n"
- "ldr s8, [x22], #0x4\n"
- "tbz x20, #1, 23f\n"
- "ld1 { v8.h }[2], [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[6], [x22]\n"
- "b 26f\n"
- "23:" // 4 rounds: Partial load: partial_1_4
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[4], [x22]\n"
- "b 26f\n"
- "24:" // 4 rounds: Partial load: partial_2_0
- "tbz x20, #1, 25f\n"
- "ldr h8, [x22], #0x2\n"
- "tbz x20, #0, 26f\n"
- "ld1 { v8.b }[2], [x22]\n"
- "b 26f\n"
- "25:" // 4 rounds: Partial load: partial_1_0
- "ldr b8, [x22, #0x0]\n"
- "26:" // 4 rounds: Partial load: Done
- "b 28f\n"
- "27:" // 4 rounds: Full load
- "ldr q8, [x22, #0x0]\n"
- "ldr q13, [x22, #0x10]\n"
- "ldr q12, [x22, #0x20]\n"
- "add x22, x22, #0x30\n"
- "28:" // 4 rounds: Load done
- "sub v0.16b, v8.16b, v11.16b\n"
- "sub v7.16b, v8.16b, v10.16b\n"
- "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
- "sub v6.16b, v8.16b, v9.16b\n"
- "sub v5.16b, v13.16b, v11.16b\n"
- "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
- "sub v4.16b, v13.16b, v10.16b\n"
- "sub v3.16b, v13.16b, v9.16b\n"
- "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
- "sub v2.16b, v12.16b, v11.16b\n"
- "sub v1.16b, v12.16b, v10.16b\n"
- "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
- "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
- "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
- "orr v8.16b, v8.16b, v0.16b\n"
- "sub v0.16b, v12.16b, v9.16b\n"
- "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
- "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
- "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
- "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
- "orr v7.16b, v7.16b, v6.16b\n"
- "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
- "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
- "orr v13.16b, v13.16b, v5.16b\n"
- "orr v4.16b, v4.16b, v3.16b\n"
- "orr v12.16b, v12.16b, v2.16b\n"
- "cmp x20, #0x30\n"
- "orr v1.16b, v1.16b, v0.16b\n"
- "orr v8.16b, v8.16b, v7.16b\n"
- "orr v13.16b, v13.16b, v4.16b\n"
- "orr v12.16b, v12.16b, v1.16b\n"
- "bge 53f\n"
- "tbz x20, #5, 36f\n"
- "st1 { v8.16b }, [x21], #0x10\n"
- "st1 { v13.16b }, [x21], #0x10\n"
- "tbz x20, #3, 32f\n"
- "str d12, [x21], #0x8\n"
- "tbz x20, #2, 30f\n"
- "st1 { v12.s }[2], [x21], #0x4\n"
- "tbz x20, #1, 29f\n"
- "st1 { v12.h }[6], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[14], [x21]\n"
- "b 52f\n"
- "29:" // 4 rounds: Partial writeback: partial_1_44
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[12], [x21]\n"
- "b 52f\n"
- "30:" // 4 rounds: Partial writeback: partial_2_40
- "tbz x20, #1, 31f\n"
- "st1 { v12.h }[4], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[10], [x21]\n"
- "b 52f\n"
- "31:" // 4 rounds: Partial writeback: partial_1_40
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[8], [x21]\n"
- "b 52f\n"
- "32:" // 4 rounds: Partial writeback: partial_4_32
- "tbz x20, #2, 34f\n"
- "str s12, [x21], #0x4\n"
- "tbz x20, #1, 33f\n"
- "st1 { v12.h }[2], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[6], [x21]\n"
- "b 52f\n"
- "33:" // 4 rounds: Partial writeback: partial_1_36
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[4], [x21]\n"
- "b 52f\n"
- "34:" // 4 rounds: Partial writeback: partial_2_32
- "tbz x20, #1, 35f\n"
- "str h12, [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v12.b }[2], [x21]\n"
- "b 52f\n"
- "35:" // 4 rounds: Partial writeback: partial_1_32
- "tbz x20, #0, 52f\n"
- "str b12, [x21, #0x0]\n"
- "b 52f\n"
- "36:" // 4 rounds: Partial writeback: partial_16_0
- "tbz x20, #4, 44f\n"
- "st1 { v8.16b }, [x21], #0x10\n"
- "tbz x20, #3, 40f\n"
- "str d13, [x21], #0x8\n"
- "tbz x20, #2, 38f\n"
- "st1 { v13.s }[2], [x21], #0x4\n"
- "tbz x20, #1, 37f\n"
- "st1 { v13.h }[6], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[14], [x21]\n"
- "b 52f\n"
- "37:" // 4 rounds: Partial writeback: partial_1_28
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[12], [x21]\n"
- "b 52f\n"
- "38:" // 4 rounds: Partial writeback: partial_2_24
- "tbz x20, #1, 39f\n"
- "st1 { v13.h }[4], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[10], [x21]\n"
- "b 52f\n"
- "39:" // 4 rounds: Partial writeback: partial_1_24
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[8], [x21]\n"
- "b 52f\n"
- "40:" // 4 rounds: Partial writeback: partial_4_16
- "tbz x20, #2, 42f\n"
- "str s13, [x21], #0x4\n"
- "tbz x20, #1, 41f\n"
- "st1 { v13.h }[2], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[6], [x21]\n"
- "b 52f\n"
- "41:" // 4 rounds: Partial writeback: partial_1_20
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[4], [x21]\n"
- "b 52f\n"
- "42:" // 4 rounds: Partial writeback: partial_2_16
- "tbz x20, #1, 43f\n"
- "str h13, [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v13.b }[2], [x21]\n"
- "b 52f\n"
- "43:" // 4 rounds: Partial writeback: partial_1_16
- "tbz x20, #0, 52f\n"
- "str b13, [x21, #0x0]\n"
- "b 52f\n"
- "44:" // 4 rounds: Partial writeback: partial_8_0
- "tbz x20, #3, 48f\n"
- "str d8, [x21], #0x8\n"
- "tbz x20, #2, 46f\n"
- "st1 { v8.s }[2], [x21], #0x4\n"
- "tbz x20, #1, 45f\n"
- "st1 { v8.h }[6], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[14], [x21]\n"
- "b 52f\n"
- "45:" // 4 rounds: Partial writeback: partial_1_12
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[12], [x21]\n"
- "b 52f\n"
- "46:" // 4 rounds: Partial writeback: partial_2_8
- "tbz x20, #1, 47f\n"
- "st1 { v8.h }[4], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[10], [x21]\n"
- "b 52f\n"
- "47:" // 4 rounds: Partial writeback: partial_1_8
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[8], [x21]\n"
- "b 52f\n"
- "48:" // 4 rounds: Partial writeback: partial_4_0
- "tbz x20, #2, 50f\n"
- "str s8, [x21], #0x4\n"
- "tbz x20, #1, 49f\n"
- "st1 { v8.h }[2], [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[6], [x21]\n"
- "b 52f\n"
- "49:" // 4 rounds: Partial writeback: partial_1_4
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[4], [x21]\n"
- "b 52f\n"
- "50:" // 4 rounds: Partial writeback: partial_2_0
- "tbz x20, #1, 51f\n"
- "str h8, [x21], #0x2\n"
- "tbz x20, #0, 52f\n"
- "st1 { v8.b }[2], [x21]\n"
- "b 52f\n"
- "51:" // 4 rounds: Partial writeback: partial_1_0
- "str b8, [x21, #0x0]\n"
- "52:" // 4 rounds: Partial writeback: Done
- "b 54f\n"
- "53:" // 4 rounds: Full writeback
- "str q8, [x21, #0x0]\n"
- "str q13, [x21, #0x10]\n"
- "str q12, [x21, #0x20]\n"
- "add x21, x21, #0x30\n"
- "54:" // 4 rounds: Writeback done
- "subs x20, x20, #0x30\n"
- "bgt 2b\n"
- "add x23, x23, #0x1\n"
- "cmp x23, %x[num_strings]\n"
- "bne 1b\n"
- :
- : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
+ __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n"
+ "ldr q17, [%x[table], #0x10]\n"
+ "mov x23, #0x0\n"
+ "ldr q18, [%x[table], #0x20]\n"
+ "ldr q19, [%x[table], #0x30]\n"
+ "ldr q20, [%x[table], #0x40]\n"
+ "ldr q21, [%x[table], #0x50]\n"
+ "ldr q22, [%x[table], #0x60]\n"
+ "ldr q23, [%x[table], #0x70]\n"
+ "ldr q24, [%x[table], #0x80]\n"
+ "ldr q25, [%x[table], #0x90]\n"
+ "ldr q26, [%x[table], #0xa0]\n"
+ "ldr q27, [%x[table], #0xb0]\n"
+ "ldr q28, [%x[table], #0xc0]\n"
+ "ldr q29, [%x[table], #0xd0]\n"
+ "ldr q30, [%x[table], #0xe0]\n"
+ "ldr q31, [%x[table], #0xf0]\n"
+ "1:" // string loop
+ "ldr x22, [%x[input], x23, LSL #0x3]\n"
+ "ldr x21, [%x[output], x23, LSL #0x3]\n"
+ "movi v11.16b, #0x40\n"
+ "movi v10.16b, #0x80\n"
+ "movi v9.16b, #0xc0\n"
+ "mov x20, %x[string_length]\n"
+ "2:" // 4 rounds: width loop
+ "cmp x20, #0x30\n"
+ "bge 27f\n"
+ "tbz x20, #5, 10f\n"
+ "ld1 { v8.16b }, [x22], #0x10\n"
+ "ld1 { v13.16b }, [x22], #0x10\n"
+ "tbz x20, #3, 6f\n"
+ "ldr d12, [x22], #0x8\n"
+ "tbz x20, #2, 4f\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 3f\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[14], [x22]\n"
+ "b 26f\n"
+ "3:" // 4 rounds: Partial load: partial_1_44
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[12], [x22]\n"
+ "b 26f\n"
+ "4:" // 4 rounds: Partial load: partial_2_40
+ "tbz x20, #1, 5f\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[10], [x22]\n"
+ "b 26f\n"
+ "5:" // 4 rounds: Partial load: partial_1_40
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[8], [x22]\n"
+ "b 26f\n"
+ "6:" // 4 rounds: Partial load: partial_4_32
+ "tbz x20, #2, 8f\n"
+ "ldr s12, [x22], #0x4\n"
+ "tbz x20, #1, 7f\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[6], [x22]\n"
+ "b 26f\n"
+ "7:" // 4 rounds: Partial load: partial_1_36
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[4], [x22]\n"
+ "b 26f\n"
+ "8:" // 4 rounds: Partial load: partial_2_32
+ "tbz x20, #1, 9f\n"
+ "ldr h12, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v12.b }[2], [x22]\n"
+ "b 26f\n"
+ "9:" // 4 rounds: Partial load: partial_1_32
+ "tbz x20, #0, 26f\n"
+ "ldr b12, [x22, #0x0]\n"
+ "b 26f\n"
+ "10:" // 4 rounds: Partial load: partial_16_0
+ "tbz x20, #4, 18f\n"
+ "ld1 { v8.16b }, [x22], #0x10\n"
+ "tbz x20, #3, 14f\n"
+ "ldr d13, [x22], #0x8\n"
+ "tbz x20, #2, 12f\n"
+ "ld1 { v13.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 11f\n"
+ "ld1 { v13.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[14], [x22]\n"
+ "b 26f\n"
+ "11:" // 4 rounds: Partial load: partial_1_28
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[12], [x22]\n"
+ "b 26f\n"
+ "12:" // 4 rounds: Partial load: partial_2_24
+ "tbz x20, #1, 13f\n"
+ "ld1 { v13.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[10], [x22]\n"
+ "b 26f\n"
+ "13:" // 4 rounds: Partial load: partial_1_24
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[8], [x22]\n"
+ "b 26f\n"
+ "14:" // 4 rounds: Partial load: partial_4_16
+ "tbz x20, #2, 16f\n"
+ "ldr s13, [x22], #0x4\n"
+ "tbz x20, #1, 15f\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[6], [x22]\n"
+ "b 26f\n"
+ "15:" // 4 rounds: Partial load: partial_1_20
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[4], [x22]\n"
+ "b 26f\n"
+ "16:" // 4 rounds: Partial load: partial_2_16
+ "tbz x20, #1, 17f\n"
+ "ldr h13, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v13.b }[2], [x22]\n"
+ "b 26f\n"
+ "17:" // 4 rounds: Partial load: partial_1_16
+ "tbz x20, #0, 26f\n"
+ "ldr b13, [x22, #0x0]\n"
+ "b 26f\n"
+ "18:" // 4 rounds: Partial load: partial_8_0
+ "tbz x20, #3, 22f\n"
+ "ldr d8, [x22], #0x8\n"
+ "tbz x20, #2, 20f\n"
+ "ld1 { v8.s }[2], [x22], #0x4\n"
+ "tbz x20, #1, 19f\n"
+ "ld1 { v8.h }[6], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[14], [x22]\n"
+ "b 26f\n"
+ "19:" // 4 rounds: Partial load: partial_1_12
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[12], [x22]\n"
+ "b 26f\n"
+ "20:" // 4 rounds: Partial load: partial_2_8
+ "tbz x20, #1, 21f\n"
+ "ld1 { v8.h }[4], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[10], [x22]\n"
+ "b 26f\n"
+ "21:" // 4 rounds: Partial load: partial_1_8
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[8], [x22]\n"
+ "b 26f\n"
+ "22:" // 4 rounds: Partial load: partial_4_0
+ "tbz x20, #2, 24f\n"
+ "ldr s8, [x22], #0x4\n"
+ "tbz x20, #1, 23f\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "b 26f\n"
+ "23:" // 4 rounds: Partial load: partial_1_4
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "b 26f\n"
+ "24:" // 4 rounds: Partial load: partial_2_0
+ "tbz x20, #1, 25f\n"
+ "ldr h8, [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "b 26f\n"
+ "25:" // 4 rounds: Partial load: partial_1_0
+ "ldr b8, [x22, #0x0]\n"
+ "26:" // 4 rounds: Partial load: Done
+ "b 28f\n"
+ "27:" // 4 rounds: Full load
+ "ldr q8, [x22, #0x0]\n"
+ "ldr q13, [x22, #0x10]\n"
+ "ldr q12, [x22, #0x20]\n"
+ "add x22, x22, #0x30\n"
+ "28:" // 4 rounds: Load done
+ "sub v0.16b, v8.16b, v11.16b\n"
+ "sub v7.16b, v8.16b, v10.16b\n"
+ "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
+ "sub v6.16b, v8.16b, v9.16b\n"
+ "sub v5.16b, v13.16b, v11.16b\n"
+ "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
+ "sub v4.16b, v13.16b, v10.16b\n"
+ "sub v3.16b, v13.16b, v9.16b\n"
+ "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
+ "sub v2.16b, v12.16b, v11.16b\n"
+ "sub v1.16b, v12.16b, v10.16b\n"
+ "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
+ "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
+ "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
+ "orr v8.16b, v8.16b, v0.16b\n"
+ "sub v0.16b, v12.16b, v9.16b\n"
+ "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
+ "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
+ "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
+ "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
+ "orr v7.16b, v7.16b, v6.16b\n"
+ "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
+ "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
+ "orr v13.16b, v13.16b, v5.16b\n"
+ "orr v4.16b, v4.16b, v3.16b\n"
+ "orr v12.16b, v12.16b, v2.16b\n"
+ "cmp x20, #0x30\n"
+ "orr v1.16b, v1.16b, v0.16b\n"
+ "orr v8.16b, v8.16b, v7.16b\n"
+ "orr v13.16b, v13.16b, v4.16b\n"
+ "orr v12.16b, v12.16b, v1.16b\n"
+ "bge 53f\n"
+ "tbz x20, #5, 36f\n"
+ "st1 { v8.16b }, [x21], #0x10\n"
+ "st1 { v13.16b }, [x21], #0x10\n"
+ "tbz x20, #3, 32f\n"
+ "str d12, [x21], #0x8\n"
+ "tbz x20, #2, 30f\n"
+ "st1 { v12.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v12.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[14], [x21]\n"
+ "b 52f\n"
+ "29:" // 4 rounds: Partial writeback: partial_1_44
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[12], [x21]\n"
+ "b 52f\n"
+ "30:" // 4 rounds: Partial writeback: partial_2_40
+ "tbz x20, #1, 31f\n"
+ "st1 { v12.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[10], [x21]\n"
+ "b 52f\n"
+ "31:" // 4 rounds: Partial writeback: partial_1_40
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[8], [x21]\n"
+ "b 52f\n"
+ "32:" // 4 rounds: Partial writeback: partial_4_32
+ "tbz x20, #2, 34f\n"
+ "str s12, [x21], #0x4\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v12.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[6], [x21]\n"
+ "b 52f\n"
+ "33:" // 4 rounds: Partial writeback: partial_1_36
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[4], [x21]\n"
+ "b 52f\n"
+ "34:" // 4 rounds: Partial writeback: partial_2_32
+ "tbz x20, #1, 35f\n"
+ "str h12, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v12.b }[2], [x21]\n"
+ "b 52f\n"
+ "35:" // 4 rounds: Partial writeback: partial_1_32
+ "tbz x20, #0, 52f\n"
+ "str b12, [x21, #0x0]\n"
+ "b 52f\n"
+ "36:" // 4 rounds: Partial writeback: partial_16_0
+ "tbz x20, #4, 44f\n"
+ "st1 { v8.16b }, [x21], #0x10\n"
+ "tbz x20, #3, 40f\n"
+ "str d13, [x21], #0x8\n"
+ "tbz x20, #2, 38f\n"
+ "st1 { v13.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 37f\n"
+ "st1 { v13.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[14], [x21]\n"
+ "b 52f\n"
+ "37:" // 4 rounds: Partial writeback: partial_1_28
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[12], [x21]\n"
+ "b 52f\n"
+ "38:" // 4 rounds: Partial writeback: partial_2_24
+ "tbz x20, #1, 39f\n"
+ "st1 { v13.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[10], [x21]\n"
+ "b 52f\n"
+ "39:" // 4 rounds: Partial writeback: partial_1_24
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[8], [x21]\n"
+ "b 52f\n"
+ "40:" // 4 rounds: Partial writeback: partial_4_16
+ "tbz x20, #2, 42f\n"
+ "str s13, [x21], #0x4\n"
+ "tbz x20, #1, 41f\n"
+ "st1 { v13.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[6], [x21]\n"
+ "b 52f\n"
+ "41:" // 4 rounds: Partial writeback: partial_1_20
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[4], [x21]\n"
+ "b 52f\n"
+ "42:" // 4 rounds: Partial writeback: partial_2_16
+ "tbz x20, #1, 43f\n"
+ "str h13, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v13.b }[2], [x21]\n"
+ "b 52f\n"
+ "43:" // 4 rounds: Partial writeback: partial_1_16
+ "tbz x20, #0, 52f\n"
+ "str b13, [x21, #0x0]\n"
+ "b 52f\n"
+ "44:" // 4 rounds: Partial writeback: partial_8_0
+ "tbz x20, #3, 48f\n"
+ "str d8, [x21], #0x8\n"
+ "tbz x20, #2, 46f\n"
+ "st1 { v8.s }[2], [x21], #0x4\n"
+ "tbz x20, #1, 45f\n"
+ "st1 { v8.h }[6], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[14], [x21]\n"
+ "b 52f\n"
+ "45:" // 4 rounds: Partial writeback: partial_1_12
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[12], [x21]\n"
+ "b 52f\n"
+ "46:" // 4 rounds: Partial writeback: partial_2_8
+ "tbz x20, #1, 47f\n"
+ "st1 { v8.h }[4], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[10], [x21]\n"
+ "b 52f\n"
+ "47:" // 4 rounds: Partial writeback: partial_1_8
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[8], [x21]\n"
+ "b 52f\n"
+ "48:" // 4 rounds: Partial writeback: partial_4_0
+ "tbz x20, #2, 50f\n"
+ "str s8, [x21], #0x4\n"
+ "tbz x20, #1, 49f\n"
+ "st1 { v8.h }[2], [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[6], [x21]\n"
+ "b 52f\n"
+ "49:" // 4 rounds: Partial writeback: partial_1_4
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[4], [x21]\n"
+ "b 52f\n"
+ "50:" // 4 rounds: Partial writeback: partial_2_0
+ "tbz x20, #1, 51f\n"
+ "str h8, [x21], #0x2\n"
+ "tbz x20, #0, 52f\n"
+ "st1 { v8.b }[2], [x21]\n"
+ "b 52f\n"
+ "51:" // 4 rounds: Partial writeback: partial_1_0
+ "str b8, [x21, #0x0]\n"
+ "52:" // 4 rounds: Partial writeback: Done
+ "b 54f\n"
+ "53:" // 4 rounds: Full writeback
+ "str q8, [x21, #0x0]\n"
+ "str q13, [x21, #0x10]\n"
+ "str q12, [x21, #0x20]\n"
+ "add x21, x21, #0x30\n"
+ "54:" // 4 rounds: Writeback done
+ "subs x20, x20, #0x30\n"
+ "bgt 2b\n"
+ "add x23, x23, #0x1\n"
+ "cmp x23, %x[num_strings]\n"
+ "bne 1b\n"
+ :
+ : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output),
+ [string_length] "r"(string_length), [table] "r"(table)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+ "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
}
#endif // __aarch64__
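The hand-written AArch64 routine above implements a full 256-entry byte lookup: the table is preloaded into q16-q31, each 16-byte input chunk is looked up four times with TBL (once per 64-byte quarter of the table, after subtracting the 0x40/0x80/0xC0 constants held in v11/v10/v9), and the four partial results are OR-ed together, with the labelled branches handling partial loads and stores at string tails. A reference-only C++ sketch of the mapping it computes is shown below; the function name lut_u8_reference is illustrative, not the library entry point, and it makes no attempt at the vectorized table-lookup trick:

    #include <cstddef>
    #include <cstdint>

    // For every string, each output byte is the table entry selected by the
    // corresponding input byte; the assembly vectorizes this 48 bytes at a time.
    void lut_u8_reference(const uint8_t *table,
                          size_t num_strings,
                          size_t string_length,
                          const uint8_t *const *input,
                          uint8_t *const *output)
    {
        for (size_t s = 0; s < num_strings; ++s)
        {
            for (size_t i = 0; i < string_length; ++i)
            {
                output[s][i] = table[input[s][i]];
            }
        }
    }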
diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp
index b80d75326e..ee8572703e 100644
--- a/src/cpu/kernels/lut/generic/sve2/u8.cpp
+++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp
@@ -32,11 +32,7 @@ namespace arm_compute
namespace cpu
{
void lut_u8_sve2(
- const uint8_t *table,
- size_t num_strings,
- size_t string_length,
- const uint8_t *const *input,
- uint8_t *const *output)
+ const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
{
__asm__ __volatile__(
"ptrue p0.b\n"
@@ -636,7 +632,9 @@ void lut_u8_sve2(
"bne 2b\n"
: [table] "+&r"(table)
: [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1",
+ "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21",
+ "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
}
} // namespace cpu
diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h
index 7a2afc6927..da90346267 100644
--- a/src/cpu/kernels/lut/list.h
+++ b/src/cpu/kernels/lut/list.h
@@ -34,13 +34,9 @@ namespace cpu
{
#ifdef __aarch64__
-#define DECLARE_LUT_KERNEL(func_name) \
- void func_name( \
- const uint8_t *table, \
- size_t num_strings, \
- size_t string_length, \
- const uint8_t *const *input, \
- uint8_t *const *output)
+#define DECLARE_LUT_KERNEL(func_name) \
+ void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \
+ uint8_t *const *output)
DECLARE_LUT_KERNEL(lut_u8_neon);
DECLARE_LUT_KERNEL(lut_u8_sve2);
diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h
index 5fe19c4707..73a5b86a2f 100644
--- a/src/cpu/kernels/maxunpool/generic/neon/impl.h
+++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -37,13 +38,15 @@ void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output
Iterator indices_itr(indices, window);
auto out_ptr = reinterpret_cast<T *>(output->buffer());
const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
- auto vinput = reinterpret_cast<T *>(input_itr.ptr());
- out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
- },
- input_itr, indices_itr);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr());
+ auto vinput = reinterpret_cast<T *>(input_itr.ptr());
+ out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+ },
+ input_itr, indices_itr);
}
} // namespace cpu
} // namespace arm_compute
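The max_unpooling template above is a scatter: each pooled value is written to the flat output offset recorded in the matching indices element, with the batch handled via id[3] and out_stride_w. A hedged stand-alone sketch of the core scatter, ignoring batching, is given below; max_unpool_reference and the flat-pointer interface are illustrative, and the explicit zero fill stands in for output initialisation that the surrounding function is expected to perform, since the kernel shown only writes the recorded maxima:

    #include <cstddef>
    #include <cstdint>

    template <typename T>
    void max_unpool_reference(
        const T *pooled, const uint32_t *indices, std::size_t num_pooled, T *output, std::size_t output_len)
    {
        // Non-max positions stay at zero; only recorded maxima are written back.
        for (std::size_t i = 0; i < output_len; ++i)
        {
            output[i] = T(0);
        }
        for (std::size_t i = 0; i < num_pooled; ++i)
        {
            output[indices[i]] = pooled[i];
        }
    }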
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
index 96e4030268..6470f391e2 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -23,9 +23,9 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
namespace arm_compute
{
@@ -45,64 +45,66 @@ void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, fl
Iterator input_itr(input, win);
Iterator output_itr(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr());
- auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
- float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
- float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
+ float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f));
+ float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- float16x8_t data = vld1q_f16(in_ptr + x);
- sum_vec = vaddq_f16(sum_vec, data);
- float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
- float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
- sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
- sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8_t data = vld1q_f16(in_ptr + x);
+ sum_vec = vaddq_f16(sum_vec, data);
+ float32x4_t dl = vcvt_f32_f16(vget_low_f16(data));
+ float32x4_t dh = vcvt_f32_f16(vget_high_f16(data));
+ sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
+ sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
+ }
- float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
- sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
- sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
+ float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
+ sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
+ sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res);
- float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
- sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
+ float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
+ sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
- float16_t sum = vget_lane_f16(sum_carry_res, 0);
- float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
+ float16_t sum = vget_lane_f16(sum_carry_res, 0);
+ float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- float16_t data = *(in_ptr + x);
- sum += data;
- float fdata = static_cast<float>(data);
- sum_sq += fdata * fdata;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ float16_t data = *(in_ptr + x);
+ sum += data;
+ float fdata = static_cast<float>(data);
+ sum_sq += fdata * fdata;
+ }
- float16_t mean = sum / input->info()->dimension(0);
- float var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
- float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
+ float16_t mean = sum / input->info()->dimension(0);
+ float var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
- float16x8_t mean_vec = vdupq_n_f16(mean);
- float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
+ float16x8_t mean_vec = vdupq_n_f16(mean);
+ float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- float16x8_t data = vld1q_f16(in_ptr + x);
- float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
- // Store results
- vst1q_f16(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input_itr, output_itr);
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ float16x8_t data = vld1q_f16(in_ptr + x);
+ float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
+ // Store results
+ vst1q_f16(out_ptr + x, res);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
}
void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
index 0522d6e277..11f6294a35 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -45,60 +46,62 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, c
Iterator input_itr(input, win);
Iterator output_itr(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr());
- auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
- auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+ auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+ auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- sum_vec = wrapper::vadd(sum_vec, data);
- sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ sum_vec = wrapper::vadd(sum_vec, data);
+ sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
+ }
- auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
- auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
- for(int i = 0; i < size / 4; ++i)
- {
- sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
- sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
- }
+ auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
+ auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
+ for (int i = 0; i < size / 4; ++i)
+ {
+ sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res);
+ sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
+ }
- auto sum = wrapper::vgetlane(sum_carry_res, 0);
- auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
+ auto sum = wrapper::vgetlane(sum_carry_res, 0);
+ auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- ScalarType data = *(in_ptr + x);
- sum += data;
- sum_sq += data * data;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ ScalarType data = *(in_ptr + x);
+ sum += data;
+ sum_sq += data * data;
+ }
- ScalarType mean = sum / input->info()->dimension(0);
- ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
- ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
+ ScalarType mean = sum / input->info()->dimension(0);
+ ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+ ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
- auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto data = wrapper::vloadq(in_ptr + x);
- auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
- // Store results
- wrapper::vstore(out_ptr + x, res);
- }
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
- }
- },
- input_itr, output_itr);
+ auto mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ auto data = wrapper::vloadq(in_ptr + x);
+ auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
+ // Store results
+ wrapper::vstore(out_ptr + x, res);
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+ }
+ },
+ input_itr, output_itr);
}
template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window);
} // namespace cpu
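The float paths above all follow the same per-row recipe: accumulate the sum and sum of squares, derive mean and variance, then apply out = (in - mean) / sqrt(var + epsilon), vectorized with a scalar tail. A reference-only scalar sketch of that recipe follows; mean_stddev_normalize_row is an illustrative name, not a library symbol:

    #include <cmath>
    #include <cstddef>

    void mean_stddev_normalize_row(const float *in, float *out, std::size_t len, float epsilon)
    {
        float sum    = 0.f;
        float sum_sq = 0.f;
        for (std::size_t x = 0; x < len; ++x)
        {
            sum += in[x];
            sum_sq += in[x] * in[x];
        }
        const float mean       = sum / static_cast<float>(len);
        const float var        = sum_sq / static_cast<float>(len) - mean * mean;
        const float stddev_inv = 1.f / std::sqrt(var + epsilon);
        for (std::size_t x = 0; x < len; ++x)
        {
            out[x] = (in[x] - mean) * stddev_inv;
        }
    }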
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
index 53af1e4b16..32654df5dc 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -69,77 +70,76 @@ void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon,
const float32x4_t quant_min_vec = vdupq_n_f32(0.0f);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- int x = window_start_x;
- auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr());
- auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr());
+ auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
- uint32x4_t sum_vec = vdupq_n_u32(0);
- uint32x4_t sum_sq_vec = vdupq_n_u32(0);
+ uint32x4_t sum_vec = vdupq_n_u32(0);
+ uint32x4_t sum_sq_vec = vdupq_n_u32(0);
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t data = vld1q_u8(in_ptr + x);
- sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
- const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data));
- const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
- sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
- }
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t data = vld1q_u8(in_ptr + x);
+ sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
+ const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data));
+ const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
+ sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
+ }
#ifdef __aarch64__
- sum_vec = vpaddq_u32(sum_vec, sum_vec);
- sum_vec = vpaddq_u32(sum_vec, sum_vec);
- uint32_t sum = vgetq_lane_u32(sum_vec, 0);
- sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
- sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
- uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
+ sum_vec = vpaddq_u32(sum_vec, sum_vec);
+ sum_vec = vpaddq_u32(sum_vec, sum_vec);
+ uint32_t sum = vgetq_lane_u32(sum_vec, 0);
+ sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+ sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+ uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
#elif __arm__ // #ifdef __aarch64__
- uint32_t sum = vgetq_lane_u32(sum_vec, 0) +
- vgetq_lane_u32(sum_vec, 1) +
- vgetq_lane_u32(sum_vec, 2) +
- vgetq_lane_u32(sum_vec, 3);
+ uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) +
+ vgetq_lane_u32(sum_vec, 3);
- uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) +
- vgetq_lane_u32(sum_sq_vec, 1) +
- vgetq_lane_u32(sum_sq_vec, 2) +
- vgetq_lane_u32(sum_sq_vec, 3);
+ uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) +
+ vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3);
#endif // #ifdef __aarch64__
- for(; x < window_end_x; ++x)
- {
- auto data = static_cast<uint32_t>(*(in_ptr + x));
- sum += data;
- sum_sq += (data * data);
- }
+ for (; x < window_end_x; ++x)
+ {
+ auto data = static_cast<uint32_t>(*(in_ptr + x));
+ sum += data;
+ sum_sq += (data * data);
+ }
- const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
- const float var = (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
- const float stdev_inv = 1.0f / sqrtf(var + epsilon);
- const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale);
- const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
- for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t data = vld1q_u8(in_ptr + x);
- float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
- float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
- float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
- float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
- db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
- db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
- db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
- db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
- const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
- vst1q_u8(out_ptr + x, out);
- }
+ const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
+ const float var =
+ (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
+ const float stdev_inv = 1.0f / sqrtf(var + epsilon);
+ const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale);
+ const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
+ for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const uint8x16_t data = vld1q_u8(in_ptr + x);
+ float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
+ float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
+ float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
+ float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
+ db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
+ const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
+ vst1q_u8(out_ptr + x, out);
+ }
- for(; x < window_end_x; ++x)
- {
- auto data = static_cast<float32_t>(*(in_ptr + x));
- const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
- *(out_ptr + x) = res;
- }
- },
- input_itr, output_itr);
+ for (; x < window_end_x; ++x)
+ {
+ auto data = static_cast<float32_t>(*(in_ptr + x));
+ const uint8_t res =
+ data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
+ *(out_ptr + x) = res;
+ }
+ },
+ input_itr, output_itr);
}
} // namespace cpu
} // namespace arm_compute
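The quantized variant above folds dequantization, normalization and requantization into one affine transform: once mean and variance are known it builds v_scale = stdev_inv * output_inv_scale and v_offset = -mean * stdev_inv * output_inv_scale + output_offset, so every byte becomes clamp(in * v_scale + v_offset). A hedged scalar sketch of that inner step is below; qasymm8_normalize_row is an illustrative name, and the [0, 255] clamp range is an assumption standing in for quant_min_vec/quant_max_vec:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    void qasymm8_normalize_row(const uint8_t *in,
                               uint8_t       *out,
                               std::size_t    len,
                               float          mean,
                               float          var,
                               float          epsilon,
                               float          output_inv_scale, // 1 / output quantization scale
                               float          output_offset)    // output zero point
    {
        const float stdev_inv = 1.0f / std::sqrt(var + epsilon);
        const float scale     = stdev_inv * output_inv_scale;
        const float offset    = -mean * stdev_inv * output_inv_scale + output_offset;
        for (std::size_t x = 0; x < len; ++x)
        {
            const float v = static_cast<float>(in[x]) * scale + offset;
            out[x]        = static_cast<uint8_t>(std::min(std::max(v, 0.0f), 255.0f));
        }
    }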
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
index 4e15d3ad3f..4af59c2ad4 100644
--- a/src/cpu/kernels/pool2d/neon/fp16.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
@@ -37,7 +38,12 @@ namespace cpu
{
namespace
{
-void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_f16_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -53,8 +59,8 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int pad_right = src->info()->padding().right;
@@ -63,97 +69,114 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
- const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
- const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
- const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
- const auto v_x0 = vld1q_f16(in_x0_ptr);
- const auto v_x1 = vld1q_f16(in_x1_ptr);
- const auto v_x2 = vld1q_f16(in_x2_ptr);
- const auto v_x3 = vld1q_f16(in_x3_ptr);
- float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
- const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
- const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
- const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
- const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
- const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
- const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
- const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
- const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
- const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
- const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
- // Store indicies
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
- float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
-}
-}
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int in_x0_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x1_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x2_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x3_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
+ const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
+ const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
+ const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
+ const auto v_x0 = vld1q_f16(in_x0_ptr);
+ const auto v_x1 = vld1q_f16(in_x1_ptr);
+ const auto v_x2 = vld1q_f16(in_x2_ptr);
+ const auto v_x3 = vld1q_f16(in_x3_ptr);
+ float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
+
+ const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+ const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+ pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+ const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7};
+ const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
+ const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+ const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7};
+ const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
+ const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+ const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7};
+ const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
+ const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+ const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7};
+ const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
+ const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
+ const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
+ const uint16x8_t tmp_indices2 =
+ vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+ const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
+ const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
+                // Store indices
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
+ }
-void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
+ float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+
+ const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+ const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+ pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+ const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
+ const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
+ const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+ // Store indices
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+ }
+ },
+ in, out, indices);
+}
+} // namespace
+
+void poolingMxN_fp16_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
+ if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
{
pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
}
@@ -167,151 +190,172 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
Iterator in(src, window_src);
Iterator out(dst0, window_out);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
const float16_t min_value = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
float16x8_t vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- if(pool_info.pool_type != PoolingType::MAX)
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x8_t scale_v = vdupq_n_f16(scale);
-
- // Perform pooling
- vres = vdupq_n_f16(0.0f);
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x8_t scale_v = vdupq_n_f16(scale);
+
+ // Perform pooling
+ vres = vdupq_n_f16(0.0f);
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- vres = vaddq_f16(vres, vmulq_f16(data, data));
+ const float16x8_t data = vld1q_f16(
+ reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ vres = vaddq_f16(vres, vmulq_f16(data, data));
+ }
+ else
+ {
+ vres = vaddq_f16(vres, data);
+ }
}
- else
+ }
+ // Divide by scale
+ vres = vmulq_f16(vres, scale_v);
+ }
+ else
+ {
+ vres = vdupq_n_f16(min_value);
+
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- vres = vaddq_f16(vres, data);
+ const float16x8_t data = vld1q_f16(
+ reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = vmaxq_f16(vres, data);
}
}
}
- // Divide by scale
- vres = vmulq_f16(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f16(min_value);
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f16(vres, data);
- }
+ float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
+ vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal));
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
- vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+ // Store result
+ vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
}
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float16_t res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- // Calculate scale
- const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float16_t res = 0.0f;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
+ // Calculate scale
+ const float16_t scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data * data;
+ const float data =
+ *(reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res += data * data;
+ }
+ else
+ {
+ res += data;
+ }
}
- else
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else
+ {
+ res = min_value;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data;
+ const float16_t data =
+ *(reinterpret_cast<const float16_t *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
}
}
}
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = min_value;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
+ res = std::sqrt(res);
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
}
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
+ },
+ in, out);
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
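
For context on the L2-pooling branch in the fp16 kernel above: sqrt(vres) is obtained as vres * rsqrt(vres), where the coarse vrsqrteq_f16 estimate is refined by one Newton-Raphson step built from vrsqrtsq_f16 (which returns (3 - a*b)/2). A minimal scalar sketch of that refinement, illustrative only and not part of the patch, with rsqrt_estimate and rsqrt_step as hypothetical stand-ins for the NEON intrinsics:

#include <cmath>
#include <cstdio>

// Hypothetical scalar stand-ins for vrsqrteq_f16 / vrsqrtsq_f16:
// a deliberately coarse reciprocal-sqrt estimate, and the (3 - a*b)/2 factor.
static float rsqrt_estimate(float x)      { return (1.0f / std::sqrt(x)) * 1.02f; }
static float rsqrt_step(float a, float b) { return (3.0f - a * b) / 2.0f; }

int main()
{
    const float x  = 42.0f;                       // e.g. an accumulated sum of squares
    const float r0 = rsqrt_estimate(x);           // coarse 1/sqrt(x)
    const float r1 = r0 * rsqrt_step(x * r0, r0); // one refinement step, as in the kernel
    std::printf("x * r1 = %f vs std::sqrt(x) = %f\n", x * r1, std::sqrt(x));
    return 0;
}

A single refinement step is roughly enough for fp16 precision, which is why the kernel stops after one vrsqrtsq_f16 correction.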
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
index a400f3a95d..aaa37863cb 100644
--- a/src/cpu/kernels/pool2d/neon/fp32.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
namespace arm_compute
@@ -34,7 +35,12 @@ namespace cpu
{
namespace
{
-void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_f32_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
const int window_start_x = window.x().start();
const int window_end_x = window.x().end();
@@ -50,8 +56,8 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
float32x4_t vres;
@@ -63,89 +69,102 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds
const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
- const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
- const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
- const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
- const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
- const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
- const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
- const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
- vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-
- // Store indices
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
- res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+
+ const int in_x0_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x1_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x2_offset =
+ (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+ const int in_x3_offset =
+ (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ {
+ const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
+ const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
+ const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
+ const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
+ const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
+ const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
+ const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
+ const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
+ vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+
+ const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
+ const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t offset_x2 =
+ offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+ const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+ const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+ const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+ const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
+ const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
+ const uint32x4_t tmp_indices2 =
+ vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+
+ // Store indices
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
+ }
+
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
+ {
+ const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
+ const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
+ const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
+ const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
+ res = std::max(std::max(x2, x3), std::max(x0, x1));
+
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+
+ const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+ pool_stride_y, DataLayout::NHWC);
+ const uint32_t offset_x0 = offset_base / sizeof(float) + x_off;
+ const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t offset_x2 =
+ offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+ const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+ const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
+ const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
+ const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+ // Store indices
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+ }
+ },
+ in, out, indices);
}
} // namespace
-void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
+void poolingMxN_fp32_neon_nhwc_kernel_indices(
+ const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
{
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
constexpr int window_step_x = 4;
Window window_out = window;
@@ -160,8 +179,8 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0,
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
@@ -169,9 +188,9 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0,
float32x4_t vres;
uint32x4_t vidx;
- constexpr int idx_width = 1;
- constexpr int idx_height = 2;
- constexpr int idx_batch = 3;
+ constexpr int idx_width = 1;
+ constexpr int idx_height = 2;
+ constexpr int idx_batch = 3;
const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
@@ -182,89 +201,97 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0,
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int pool_start_x = std::max(0, -idx_width);
- const int pool_start_y = std::max(0, -idx_height);
+ const int pool_start_x = std::max(0, -idx_width);
+ const int pool_start_y = std::max(0, -idx_height);
- const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
- const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
+ const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
+ const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
- const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
+ const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
- const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
- const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
+ const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
+ const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- vres = vdupq_n_f32(min_value);
- vidx = vdupq_n_u32(0U);
- const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
- uint32_t curr_kernel_index = pool_size_x * pool_start_y;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
- curr_kernel_index += pool_start_x;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ vres = vdupq_n_f32(min_value);
+ vidx = vdupq_n_u32(0U);
+ const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+ uint32_t curr_kernel_index = pool_size_x * pool_start_y;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
- const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index);
- const uint32x4_t idxMask = vcgtq_f32(data, vres);
- vidx = vbslq_u32(idxMask, vidx_curr, vidx);
- vres = vmaxq_f32(vres, data);
- in_ptr_x += y_stride;
- curr_kernel_index++;
+ const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+ curr_kernel_index += pool_start_x;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
+ const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index);
+ const uint32x4_t idxMask = vcgtq_f32(data, vres);
+ vidx = vbslq_u32(idxMask, vidx_curr, vidx);
+ vres = vmaxq_f32(vres, data);
+ in_ptr_x += y_stride;
+ curr_kernel_index++;
+ }
+ curr_kernel_index += (pool_size_x - pool_end_x);
+ in_ptr_y += z_stride;
}
- curr_kernel_index += (pool_size_x - pool_end_x);
- in_ptr_y += z_stride;
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+ vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
}
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float res = min_value;
- uint32_t idx = 0U;
- const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ float res = min_value;
+ uint32_t idx = 0U;
+ const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float data = *(reinterpret_cast<const float *>(in_ptr_x));
- if(data > res)
+ const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- idx = pool_size_x * y + x;
- res = data;
+ const float data = *(reinterpret_cast<const float *>(in_ptr_x));
+ if (data > res)
+ {
+ idx = pool_size_x * y + x;
+ res = data;
+ }
+ in_ptr_x += y_stride;
}
- in_ptr_x += y_stride;
+ in_ptr_y += z_stride;
}
- in_ptr_y += z_stride;
- }
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
- }
- },
- out, indices);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+ *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
+ }
+ },
+ out, indices);
}
-void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp32_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
+ if ((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
{
poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window);
}
- else if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
+ else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX &&
+ !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
{
pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
}
@@ -280,153 +307,174 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1,
Iterator in(src, window_src);
Iterator out(dst0, window_out);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x =
+ pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y =
+ pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
float32x4_t vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- if(pool_info.pool_type != PoolingType::MAX)
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x4_t scale_v = vdupq_n_f32(scale);
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x4_t scale_v = vdupq_n_f32(scale);
- // Perform pooling
- vres = vdupq_n_f32(0.0f);
+ // Perform pooling
+ vres = vdupq_n_f32(0.0f);
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vmlaq_f32(vres, data, data);
- }
- else
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- vres = vaddq_f32(vres, data);
+ const float32x4_t data = vld1q_f32(
+ reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ vres = vmlaq_f32(vres, data, data);
+ }
+ else
+ {
+ vres = vaddq_f32(vres, data);
+ }
}
}
+ // Divide by scale
+ vres = vmulq_f32(vres, scale_v);
}
- // Divide by scale
- vres = vmulq_f32(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f32(min_value);
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ else
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ vres = vdupq_n_f32(min_value);
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f32(vres, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const float32x4_t data = vld1q_f32(
+ reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = vmaxq_f32(vres, data);
+ }
}
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
- };
- vres = l2_res;
- }
-
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ float32x4_t l2_res = {static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))};
+ vres = l2_res;
+ }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float res = 0.0f;
+ // Store result
+ vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+ }
- if(pool_info.pool_type != PoolingType::MAX)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float res = 0.0f;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data * data;
+ const float data =
+ *(reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ // Get power of 2 in case of l2 pooling and accumulate
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res += data * data;
+ }
+ else
+ {
+ res += data;
+ }
}
- else
+ }
+
+ // Divide by scale
+ res *= scale;
+ }
+ else
+ {
+ res = min_value;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ for (int x = pool_start_x; x < pool_end_x; ++x)
{
- res += data;
+ const float data =
+ *(reinterpret_cast<const float *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
}
}
}
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = min_value;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
+ res = std::sqrt(res);
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
}
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
+ },
+ in, out);
}
}
} // namespace cpu
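
Both fp32 index-tracking kernels above follow the same pattern: for every output element, keep the running maximum and the index of the source element that produced it, so that max-unpooling can later scatter values back to the right positions. A stripped-down scalar reference of that pattern, illustrative only and not taken from the library (maxpool2x2_with_indices is a hypothetical helper; the strict '>' keeps the first maximum, matching the vcgtq-based kernel-indices path):

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// 2x2, stride-2, no-padding max pooling over a single-channel H x W map,
// returning the flattened source index of each maximum.
static void maxpool2x2_with_indices(const std::vector<float> &src, int h, int w,
                                    std::vector<float> &dst, std::vector<uint32_t> &idx)
{
    const int oh = h / 2, ow = w / 2;
    dst.assign(oh * ow, std::numeric_limits<float>::lowest());
    idx.assign(oh * ow, 0u);
    for (int oy = 0; oy < oh; ++oy)
    {
        for (int ox = 0; ox < ow; ++ox)
        {
            for (int ky = 0; ky < 2; ++ky)
            {
                for (int kx = 0; kx < 2; ++kx)
                {
                    const uint32_t in_idx = (2 * oy + ky) * w + (2 * ox + kx);
                    if (src[in_idx] > dst[oy * ow + ox]) // strict '>' keeps the first maximum
                    {
                        dst[oy * ow + ox] = src[in_idx];
                        idx[oy * ow + ox] = in_idx;
                    }
                }
            }
        }
    }
}

int main()
{
    const std::vector<float> src = {1, 5, 2, 0,
                                    3, 4, 8, 7,
                                    0, 1, 2, 3,
                                    9, 6, 4, 5};
    std::vector<float>    dst;
    std::vector<uint32_t> idx;
    maxpool2x2_with_indices(src, 4, 4, dst, idx);
    for (size_t i = 0; i < dst.size(); ++i)
        std::printf("out[%zu] = %g (from src[%u])\n", i, dst[i], idx[i]);
    return 0;
}

The vectorised kernels perform the same comparison lane-wise, using vcgeq/vcgtq to build a selection mask and vbslq to pick between the candidate offsets.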
diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h
index eb141d6fcd..f8f458a63e 100644
--- a/src/cpu/kernels/pool2d/neon/list.h
+++ b/src/cpu/kernels/pool2d/neon/list.h
@@ -26,16 +26,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/pool2d/neon/quantized.h"
+
#include <arm_neon.h>
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_POOLING_KERNEL(func_name) \
- void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
+#define DECLARE_POOLING_KERNEL(func_name) \
+ void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \
+ const Window &window)
DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
@@ -65,7 +68,12 @@ T get_initial_min(bool use_inf_as_limit)
}
template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
+inline uint32_t offset_no_padding(uint32_t padded_offset,
+ const Coordinates &id,
+ const ITensorInfo &info,
+ int pool_stride_x,
+ int pool_stride_y,
+ DataLayout data_layout)
{
const int pad_left = info.padding().left;
const int pad_right = info.padding().right;
@@ -76,22 +84,24 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
const int pad_horiz = pad_left + pad_right;
const int pad_vert = pad_top + pad_bottom;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
- - pad_top * sizeof(T) /* top padding */
- - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
- - in_stride_w * id[3];
+ const uint32_t offset_base =
+ padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
+ - pad_top * sizeof(T) /* top padding */
+ - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() -
+ pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
+ - in_stride_w * id[3];
return offset_base;
}
else
{
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
- - pad_top * sizeof(T) // top padding
- - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
+ const uint32_t offset_base = padded_offset -
+ sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
+ - pad_top * sizeof(T) // top padding
+ - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() *
+ pool_stride_y // for each Z plane there are width*pad_right padding elems
- in_stride_w * id[3];
return offset_base;
@@ -100,4 +110,4 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id,
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
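
offset_no_padding above converts an offset measured inside the padded buffer into the element offset of the dense, unpadded tensor: the padded offset overshoots the dense one by every padding element that precedes the current position, and the chain of subtraction terms removes exactly that excess. A toy 2-D version of the bookkeeping, an illustrative assumption working in element counts rather than bytes and not the library's NCHW/NHWC formula:

#include <cstdio>

int main()
{
    const int w = 4, pad_left = 1, pad_right = 1, pad_top = 1; // element counts
    const int padded_row = pad_left + w + pad_right;           // padded row length

    const int x = 2, y = 3;                                           // element coordinates
    const int padded_off = (pad_top + y) * padded_row + pad_left + x; // index in padded buffer
    const int dense_off  = padded_off
                         - pad_top * padded_row       // full rows of top padding
                         - y * (pad_left + pad_right) // side padding of rows already passed
                         - pad_left;                  // left padding of the current row
    std::printf("padded=%d dense=%d expected=%d\n", padded_off, dense_off, y * w + x);
    return 0;
}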
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index c342b96426..ee4a67b0fb 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -25,9 +25,11 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
+
#include <limits>
#ifdef ENABLE_NCHW_KERNELS
@@ -38,15 +40,19 @@ namespace cpu
#define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
(x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr)
#define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
- (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
-#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
- ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+ (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \
+ : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
+ ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \
+ ? vdup_n_f32(fval) \
+ : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
#define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \
READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval))
-float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
+float32x4x2_t
+read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
{
float32x4x2_t vec;
vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval);
@@ -56,13 +62,14 @@ float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
+float16x4_t
+read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
{
float16_t vec[4];
const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
- for(int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
{
- if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
{
vec[i] = *(ptr + i);
}
@@ -74,94 +81,106 @@ float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t,
return wrapper::vload(vec);
}
-void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling3_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- const auto y_val_2 = (id.y() * pool_stride_y) + 2;
- float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_0, reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
- float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_1, reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
- float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_2, reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
+ const unsigned char *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_middle_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const unsigned char *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- top_data = vmul_f16(top_data, top_data);
- middle_data = vmul_f16(middle_data, middle_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+ float16x4_t top_data =
+ read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
+ float16x4_t middle_data = read_4_boundary_aware_fp16(
+ src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
+ float16x4_t bottom_data = read_4_boundary_aware_fp16(
+ src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2,
+ reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
+ float16x4_t res = {};
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
- // Perform pooling
- const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
- res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
- res = vmul_f16(vpadd_f16(res, res), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
- res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
- res = vpmax_f16(res, res);
- }
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ middle_data = vmul_f16(middle_data, middle_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vsqrt_f16(res);
- }
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
+ // Perform pooling
+ const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+ res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+ res = vmul_f16(vpadd_f16(res, res), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+ res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
+ res = vpmax_f16(res, res);
+ }
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = vsqrt_f16(res);
+ }
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
+ *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+ },
+ in, out);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t in)
+inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type f16_to_f32(float16x4_t in)
{
- float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
+ float32x2_t out = {static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1))};
return out;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t in)
+inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type f16_to_f32(float32x2_t in)
{
return in;
}
@@ -171,9 +190,9 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int
{
T vec[2];
const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
- for(int i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++)
{
- if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
{
vec[i] = *(ptr + i);
}
@@ -186,61 +205,80 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int
}
template <typename T>
-void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_nchw_maxpool_indices(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
Iterator in(src, window_src);
Iterator out(dst0, window);
Iterator indices(dst1, window);
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const int pad_left = src->info()->padding().left;
- const int pad_right = src->info()->padding().right;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit);
- const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
- auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
- float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
- float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
-
- // Calculate max data, compare top first, then bottom, to make sue the first max is recorded.
- const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
- const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
- const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
- *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
-
- // Calculate max data indice, which will be used in max unpool.
- const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
- const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
- const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
- const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
- const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
- const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
- const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
- *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
- },
- in, out, indices);
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const int pad_left = src->info()->padding().left;
+ const int pad_right = src->info()->padding().right;
+ const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+ const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit);
+ const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+ float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
+ float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
+
+            // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
+ const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
+ const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
+ const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
+ *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+
+            // Calculate max data index, which will be used in max unpool.
+ const uint32_t offset_base =
+ offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
+ const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
+ const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+ const uint32x2_t voffset_top = {offset_top, offset_top + 1u};
+ const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u};
+ const uint32x2_t tmp_indices_top =
+ vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
+ const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)),
+ voffset_bottom, vrev64_u32(voffset_bottom));
+ *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(
+ vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+ },
+ in, out, indices);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
+ if (pool_info.pool_type == PoolingType::MAX && dst1)
{
pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
}
@@ -254,244 +292,274 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
int pool_stride_x, pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
-
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
-
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_0, in_top_ptr, fill_value);
- float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
- x_val, y_val_1, in_bottom_ptr, fill_value);
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+
+ const unsigned char *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const unsigned char *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- top_data = vmul_f16(top_data, top_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
+ const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+ y_val_0, in_top_ptr, fill_value);
+ float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+ y_val_1, in_bottom_ptr, fill_value);
+ float16x4_t res = {};
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f16(top_data, top_data);
+ bottom_data = vmul_f16(bottom_data, bottom_data);
+ }
- const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
- res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(top_data, bottom_data);
- res = vpmax_f16(max_data, max_data);
- }
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ const float16x4_t scale_v = vdup_n_f16(scale);
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vsqrt_f16(res);
- }
+ const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
+ res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float16x4_t max_data = vmax_f16(top_data, bottom_data);
+ res = vpmax_f16(max_data, max_data);
+ }
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = vsqrt_f16(res);
+ }
+
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+ },
+ in, out);
}
}
-void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp16_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
- const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16_t res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+ const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Calculate scale
- const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float16_t res = 0.0f;
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = 0; x < pool_size_x; ++x)
+ // Calculate scale
+ const float16_t scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float16_t *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data *= data;
+ }
- res += data;
+ res += data;
+ }
}
- }
- // Divide by scale
- res *= scale;
- }
- else // if max pooling
- {
- res = fp16_min;
-
- for(int y = 0; y < pool_size_y; ++y)
+ // Divide by scale
+ res *= scale;
+ }
+ else // if max pooling
{
- for(int x = 0; x < pool_size_x; ++x)
- {
- const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ res = fp16_min;
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- res = std::max(res, data);
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float16_t *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ res = std::max(res, data);
+ }
}
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = res;
- },
- in, out);
+ // Store result
+ *(reinterpret_cast<float16_t *>(out.ptr())) = res;
+ },
+ in, out);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+ float res = 0.0f;
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = 0; x < pool_size_x; ++x)
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data *= data;
+ }
- res += data;
+ res += data;
+ }
}
- }
- // Divide by scale
- res *= scale;
- }
- else // if max pooling
- {
- res = min_value;
-
- for(int y = 0; y < pool_size_y; ++y)
+ // Divide by scale
+ res *= scale;
+ }
+ else // if max pooling
{
- for(int x = 0; x < pool_size_x; ++x)
- {
- const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+ res = min_value;
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
- res = std::max(res, data);
+ for (int y = 0; y < pool_size_y; ++y)
+ {
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto ptr = reinterpret_cast<const float *>(
+ in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+ res = std::max(res, data);
+ }
}
}
- }
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ res = std::sqrt(res);
+ }
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = res;
- },
- in, out);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = res;
+ },
+ in, out);
}
-void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
+ if (pool_info.pool_type == PoolingType::MAX && dst1)
{
pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
}
@@ -499,64 +567,168 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
{
Iterator in(src, window_src);
Iterator out(dst0, window);
- constexpr int pool_size = 2;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ constexpr int pool_size = 2;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+ const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
+
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0,
+ in_top_ptr, fill_value);
+ auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ in_bottom_ptr, fill_value);
+ float32x2_t res = {};
+ float final_res = 0;
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ top_data = vmul_f32(top_data, top_data);
+ bottom_data = vmul_f32(bottom_data, bottom_data);
+ }
+
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+ res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+ res = vpmax_f32(max_data, max_data);
+ }
+ final_res = vget_lane_f32(res, 0);
+
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
- execute_window_loop(window, [&](const Coordinates & id)
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
+ }
+}
+
+void pooling3_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
+{
+ ARM_COMPUTE_UNUSED(dst1);
+ Iterator in(src, window_src);
+ Iterator out(dst0, window);
+
+ constexpr const int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ const uint8_t *const src_top_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+ const uint8_t *const src_middle_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+ const uint8_t *const src_bottom_ptr =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+ const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
- auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value);
- float32x2_t res = {};
- float final_res = 0;
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+ auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr,
+ fill_value);
+ auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+ in_middle_ptr, fill_value);
+ auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2,
+ in_bottom_ptr, fill_value);
+
+ float32x2_t res = {};
+ float final_res = 0;
// Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type == PoolingType::L2)
{
- top_data = vmul_f32(top_data, top_data);
- bottom_data = vmul_f32(bottom_data, bottom_data);
+ top_data = vmulq_f32(top_data, top_data);
+ middle_data = vmulq_f32(middle_data, middle_data);
+ bottom_data = vmulq_f32(bottom_data, bottom_data);
}
- if(pool_info.pool_type != PoolingType::MAX)
+ if (pool_info.pool_type != PoolingType::MAX)
{
// Calculate scale
- float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
const float32x2_t scale_v = vdup_n_f32(scale);
// Perform pooling
- const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
- res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
+ res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
}
else
{
- const float32x2_t max_data = vmax_f32(top_data, bottom_data);
- res = vpmax_f32(max_data, max_data);
+ const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(res, res);
}
final_res = vget_lane_f32(res, 0);
// Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type == PoolingType::L2)
{
final_res = sqrt(final_res);
}
@@ -565,191 +737,120 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P
*(reinterpret_cast<float *>(out.ptr())) = final_res;
},
in, out);
- }
-}
-
-void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
- const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
-
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- const auto y_val_2 = (id.y() * pool_stride_y) + 2;
- auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
- auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value);
- auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value);
-
- float32x2_t res = {};
- float final_res = 0;
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmulq_f32(top_data, top_data);
- middle_data = vmulq_f32(middle_data, middle_data);
- bottom_data = vmulq_f32(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
- res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
}
-void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling7_fp32_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
Iterator out(dst0, window);
- constexpr const int pool_size = 7;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ constexpr const int pool_size = 7;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
- const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
- std::array<const uint8_t *, pool_size> src_ptrs{ {} };
- for(int i = 0; i < pool_size; ++i)
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
+ const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+ std::array<const uint8_t *, pool_size> src_ptrs{{}};
+ for (int i = 0; i < pool_size; ++i)
{
- src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+ src_ptrs[i] =
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
}
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
-
- auto x_val = id.x() * pool_stride_x;
- auto y_val = id.y() * pool_stride_y;
- float32x4x2_t data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
- float32x2_t res = {};
- float final_res = 0.f;
+ auto x_val = id.x() * pool_stride_x;
+ auto y_val = id.y() * pool_stride_y;
+ float32x4x2_t data =
+ read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
+ float32x2_t res = {};
+ float final_res = 0.f;
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
- for(int i = 1; i < pool_size; ++i)
- {
- in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+ // Calculate scale
+ float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+ pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+ pool_pad_top, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
- x_val = id.x() * pool_stride_x;
- y_val = (id.y() * pool_stride_y) + i;
- data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
// Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
+ if (pool_info.pool_type == PoolingType::L2)
{
data.val[0] = vmulq_f32(data.val[0], data.val[0]);
data.val[1] = vmulq_f32(data.val[1], data.val[1]);
}
- sum_data = vaddq_f32(sum_data, data.val[0]);
- sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+ for (int i = 1; i < pool_size; ++i)
+ {
+ in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+
+ x_val = id.x() * pool_stride_x;
+ y_val = (id.y() * pool_stride_y) + i;
+ data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr,
+ fill_value);
+ // Get power of 2 in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+ data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+ }
+ sum_data = vaddq_f32(sum_data, data.val[0]);
+ sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+ }
+ res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
}
- res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- for(int i = 1; i < pool_size; ++i)
+ else
{
- in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+ for (int i = 1; i < pool_size; ++i)
+ {
+ in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
- x_val = id.x() * pool_stride_x;
- y_val = (id.y() * pool_stride_y) + i;
- float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
- data = vmax2q_f32(data, temp);
+ x_val = id.x() * pool_stride_x;
+ y_val = (id.y() * pool_stride_y) + i;
+ float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val,
+ in_ptr, fill_value);
+ data = vmax2q_f32(data, temp);
+ }
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
+ res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
+ res = vpmax_f32(res, res);
}
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
- res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
+ final_res = vget_lane_f32(res, 0);
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
+ // Calculate square-root in case of l2 pooling
+ if (pool_info.pool_type == PoolingType::L2)
+ {
+ final_res = sqrt(final_res);
+ }
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
+ // Store result
+ *(reinterpret_cast<float *>(out.ptr())) = final_res;
+ },
+ in, out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // ENABLE_NCHW_KERNELS
\ No newline at end of file
+#endif // ENABLE_NCHW_KERNELS
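
The hunks above only reflow the fp16/fp32 NCHW pooling kernels; the arithmetic they implement is unchanged by this patch. As a reading aid, below is a minimal scalar sketch of the per-window logic those kernels vectorise: out-of-bounds taps are filled with -inf for MAX pooling and 0 otherwise, inputs are squared for L2, the sum is scaled by the number of contributing taps, and a square root is taken at the end. The names are illustrative only and are not part of this patch or of the Compute Library API.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

enum class PoolType
{
    MAX,
    AVG,
    L2
};

// Pools one output element from a src_h x src_w plane stored row-major.
float pool_window_scalar(const std::vector<float> &plane,
                         int src_w, int src_h,
                         int out_x, int out_y,
                         int pool_size, int stride, int pad,
                         PoolType type)
{
    const float fill  = (type == PoolType::MAX) ? -std::numeric_limits<float>::infinity() : 0.f;
    float       acc   = (type == PoolType::MAX) ? fill : 0.f;
    int         valid = 0;
    for (int y = 0; y < pool_size; ++y)
    {
        for (int x = 0; x < pool_size; ++x)
        {
            const int  ix        = out_x * stride + x - pad;
            const int  iy        = out_y * stride + y - pad;
            const bool in_bounds = (ix >= 0 && iy >= 0 && ix < src_w && iy < src_h);
            float      data      = in_bounds ? plane[iy * src_w + ix] : fill;
            valid += in_bounds ? 1 : 0;
            if (type == PoolType::L2)
            {
                data *= data; // accumulate squares; sqrt after averaging
            }
            acc = (type == PoolType::MAX) ? std::max(acc, data) : acc + data;
        }
    }
    if (type != PoolType::MAX)
    {
        acc /= static_cast<float>(valid > 0 ? valid : 1); // exclude_padding-style average scale
    }
    return (type == PoolType::L2) ? std::sqrt(acc) : acc;
}
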
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
index 7f8841edd8..44675b5394 100644
--- a/src/cpu/kernels/pool2d/neon/qasymm8.cpp
+++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
@@ -25,17 +25,23 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
namespace arm_compute
{
namespace cpu
{
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_qasymm8_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
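
poolingMxN_qasymm8_neon_nhwc above, like poolingMxN_qasymm8_signed_neon_nhwc in the next file, is a thin type dispatcher over the shared poolingMxN_q8_neon_nhwc template from quantized.h. A minimal sketch of that dispatch pattern, with illustrative names and a placeholder window-max body rather than the real kernel, is:

#include <algorithm>
#include <cstdint>

// Shared kernel body, instantiated for both unsigned and signed 8-bit data.
// Placeholder logic: maximum over a window of `len` values.
template <typename T>
T pool_q8_window_max(const T *src, int len)
{
    T res = src[0];
    for (int i = 1; i < len; ++i)
    {
        res = std::max(res, src[i]);
    }
    return res;
}

// Thin entry points mirroring the qasymm8 / qasymm8_signed wrappers.
inline uint8_t pool_qasymm8_window_max(const uint8_t *src, int len)
{
    return pool_q8_window_max<uint8_t>(src, len);
}

inline int8_t pool_qasymm8_signed_window_max(const int8_t *src, int len)
{
    return pool_q8_window_max<int8_t>(src, len);
}
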
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
index 8643651f27..d434323e89 100644
--- a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
@@ -25,17 +25,23 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool2d/neon/list.h"
namespace arm_compute
{
namespace cpu
{
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
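
The quantized.h hunk below reflows the NHWC q8 pooling kernel, whose average path folds the src-to-dst requantization into one step: with uniform quantization real = src_scale * (q - src_offset), so averaging N taps and requantizing gives q_out = (sum / N) / rescale + new_offset, where rescale = dst_scale / src_scale and new_offset = dst_offset - src_offset / rescale, matching the quant_rescale and new_offset values computed in that kernel. A scalar sketch under those assumptions, with illustrative names that are not library API:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

// Averages `n` accumulated 8-bit taps and requantizes src -> dst in a single step.
uint8_t requantize_avg_pool(uint32_t sum, int n, UniformQInfo src, UniformQInfo dst)
{
    const float   rescale    = dst.scale / src.scale;
    const int32_t new_offset = dst.offset - static_cast<int32_t>(static_cast<float>(src.offset) / rescale);
    const float   q          = (static_cast<float>(sum) / static_cast<float>(n)) / rescale + static_cast<float>(new_offset);
    const long    rounded    = std::lround(q);
    return static_cast<uint8_t>(std::min<long>(255, std::max<long>(0, rounded)));
}
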
diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h
index a2cd3991be..38f1b2f1f9 100644
--- a/src/cpu/kernels/pool2d/neon/quantized.h
+++ b/src/cpu/kernels/pool2d/neon/quantized.h
@@ -26,11 +26,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/PoolingHelpers.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -38,7 +40,12 @@ namespace arm_compute
namespace cpu
{
template <typename T>
-void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_q8_neon_nhwc(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
@@ -60,15 +67,15 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P
using q32_t = typename wrapper::traits::promote_t<q16_t>;
using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
const int pool_pad_right = pool_info.pad_stride_info.pad_right();
const int pool_pad_top = pool_info.pad_stride_info.pad_top();
const int pool_pad_left = pool_info.pad_stride_info.pad_left();
const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
@@ -80,233 +87,267 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P
const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
// "new_offset" doesn't have to consider the "half_scale_v" in its computation
// With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+ const int32_t new_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
{
- if(pool_info.pool_type != PoolingType::MAX)
+ const int idx_width = id.y() * pool_stride_x;
+ const int idx_height = id.z() * pool_stride_y;
+ const int pool_limit_y = pool_pad_top - idx_height;
+ const int pool_limit_x = pool_pad_left - idx_width;
+
+ const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+ const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+ const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+ const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+ int x_off = window_start_x;
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
{
- q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
- const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
- vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
- vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
- vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
- vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x16_t data = wrapper::vloadq(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ }
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float32x4x4_t vres =
+ if (src_qinfo != dst_qinfo)
{
- {
+ const float32x4x4_t vres = {{
vcvtq_f32_q32(vres1),
vcvtq_f32_q32(vres2),
vcvtq_f32_q32(vres3),
vcvtq_f32_q32(vres4),
- }
- };
- const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+ }};
+ const auto requantized_dst =
+ vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8,
+ wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 =
+ wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 =
+ wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
}
else
{
- const float32x4_t scale_v = vdupq_n_f32(scale);
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
- vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
- vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
- vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
- const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
- const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
- }
- }
- else
- {
- q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x16_t data = wrapper::vloadq(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = wrapper::vmax(vres, data);
+ }
}
- }
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
- requant_qinfo) :
- vres);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo)
+ ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+ wrapper::vgethigh(vres), requant_qinfo)
+ : vres);
+ }
}
- }
- if(pool_info.pool_type == PoolingType::MAX)
- {
- for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+ if (pool_info.pool_type == PoolingType::MAX)
{
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
{
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const q8x8_t data = wrapper::vload(
+ reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ vres = wrapper::vmax(vres, data);
+ }
}
- }
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
- (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
}
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- if(pool_info.pool_type != PoolingType::MAX)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- q32_t res = static_cast<q32_t>(0.f);
+ if (pool_info.pool_type != PoolingType::MAX)
+ {
+ q32_t res = static_cast<q32_t>(0.f);
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ // Perform pooling
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res += data;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const T data =
+ *(reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res += data;
+ }
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
else
{
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
-
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- }
- else
- {
- T res = std::numeric_limits<T>::min();
+ T res = std::numeric_limits<T>::min();
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const T data =
+ *(reinterpret_cast<const T *>(
+ in.ptr() +
+ (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+ (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+ x_off);
+ res = std::max(res, data);
+ }
}
- }
- // Store result
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
- }
- else
- {
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ // Store result
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
}
- }
-
- },
- in, out);
+ },
+ in, out);
}
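
Note on the AVG branches above: calculate_avg_scale_pool2d() returns the reciprocal of the number of window elements that actually contribute to the average, clipping the window to the valid region when exclude_padding is set. A simplified scalar equivalent is sketched below; the function and parameter names are illustrative only and the NCHW/NHWC index mapping is ignored.

#include <algorithm>

// Illustrative only: one output position, generic coordinates, no layout mapping.
inline float avg_pool_scale(bool exclude_padding,
                            int  out_x, int out_y,
                            int  pool_size_x, int pool_size_y,
                            int  upper_bound_w, int upper_bound_h,
                            int  pad_x, int pad_y,
                            int  stride_x, int stride_y)
{
    int       start_x = out_x * stride_x - pad_x;
    int       start_y = out_y * stride_y - pad_y;
    const int end_x   = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y   = std::min(start_y + pool_size_y, upper_bound_h);
    if (exclude_padding)
    {
        start_x = std::max(0, start_x); // drop the part of the window lying in the padding
        start_y = std::max(0, start_y);
    }
    return 1.f / static_cast<float>((end_x - start_x) * (end_y - start_y));
}

Multiplying the accumulated window sum by this value is what the kernels refer to as dividing by scale.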
#if defined(ENABLE_NCHW_KERNELS)
template <typename T, typename TVec>
-inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
- const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+inline void scale_vector_q16x8(bool exclude_padding,
+ TVec &v,
+ const Coordinates &id,
+ int id_offset,
+ int step,
+ const int pool_size,
+ const int upper_bound_w,
+ const int upper_bound_h,
+ const int pad_x,
+ const int pad_y,
+ const int stride_x,
+ const int stride_y)
{
int start_x = (id.x() + id_offset) * stride_x - pad_x;
int start_y = id.y() * stride_y - pad_y;
const int end_y = std::min(start_y + pool_size, upper_bound_h);
- if(exclude_padding)
+ if (exclude_padding)
{
start_y = std::max(0, start_y);
}
- std::array<T, 8> elems =
- {
- {
- wrapper::vgetlane(v, 0),
- wrapper::vgetlane(v, 1),
- wrapper::vgetlane(v, 2),
- wrapper::vgetlane(v, 3),
- wrapper::vgetlane(v, 4),
- wrapper::vgetlane(v, 5),
- wrapper::vgetlane(v, 6),
- wrapper::vgetlane(v, 7),
- }
- };
-
- for(auto &el : elems)
+ std::array<T, 8> elems = {{
+ wrapper::vgetlane(v, 0),
+ wrapper::vgetlane(v, 1),
+ wrapper::vgetlane(v, 2),
+ wrapper::vgetlane(v, 3),
+ wrapper::vgetlane(v, 4),
+ wrapper::vgetlane(v, 5),
+ wrapper::vgetlane(v, 6),
+ wrapper::vgetlane(v, 7),
+ }};
+
+ for (auto &el : elems)
{
int c_start_x = start_x;
const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
- if(exclude_padding)
+ if (exclude_padding)
{
c_start_x = std::max(0, c_start_x);
}
@@ -326,15 +367,16 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates
}
template <typename T>
-auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
+auto load16_boundary_aware(
+ int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
{
ARM_COMPUTE_UNUSED(pad_b, pad_r);
T vec[16];
//handle reading a row out of the tensor
const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
- for(int i = 0; i < 16; i++)
+ for (int i = 0; i < 16; i++)
{
- if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+ if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
{
vec[i] = *(ptr + i);
}
@@ -349,24 +391,24 @@ auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t,
template <typename T, typename V, bool deinterleave>
inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
{
- if(deinterleave)
+ if (deinterleave)
{
- for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
{
*(ptr + i * 2) = lower[i];
}
- for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
{
*(ptr + 1 + i * 2) = upper[i];
}
}
else
{
- for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
{
*(ptr + i) = lower[i];
}
- for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
{
*(ptr + i + 8) = upper[i];
}
@@ -376,14 +418,19 @@ inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &up
template <typename T, typename V>
inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
{
- for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
+ for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
{
*(ptr + i) = v[i];
}
}
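
The load16/write16/write8 helpers above keep the inner loops free of per-element padding branches: out-of-bounds reads are replaced by a fill value (0 for AVG, the type minimum for MAX) and out-of-bounds writes are simply skipped. A reduced single-row sketch of the load side follows; the name and signature are illustrative and do not match the real helper.

#include <cstddef>
#include <vector>

// Illustrative only: one row, n consecutive elements starting at padded coordinate x.
template <typename T>
std::vector<T> load_row_boundary_aware(const T *row, int src_w, int pad_l, int x, int n, T fill_value)
{
    std::vector<T> out(static_cast<std::size_t>(n));
    for (int i = 0; i < n; ++i)
    {
        // Valid data occupies [pad_l, pad_l + src_w) in padded coordinates.
        const bool in_bounds             = (x + i >= pad_l) && (x + i < src_w + pad_l);
        out[static_cast<std::size_t>(i)] = in_bounds ? row[i] : fill_value;
    }
    return out;
}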
template <typename T>
-void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
@@ -397,129 +444,136 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
- constexpr int pool_size = 2;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ constexpr int pool_size = 2;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const T *const src_top_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
const bool have_different_qinfo = src_qinfo != dst_qinfo;
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const int dst_w = dst0->info()->dimension(0);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const int dst_w = dst0->info()->dimension(0);
const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
-
- auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
- auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- q8x8_t lower_res = {};
- q8x8_t upper_res = {};
+ auto top_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
+ q8x8_t lower_res = {};
+ q8x8_t upper_res = {};
- // Add rows
- const q16x8x2_t vrsum =
+ if (pool_info.pool_type != PoolingType::MAX)
{
- {
+ const q16x8x2_t top_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+ const q16x8x2_t bottom_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+ // Add rows
+ const q16x8x2_t vrsum = {{
wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
- }
- };
+ }};
- // Pair-wise add row data
- const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
- const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
+ // Pair-wise add row data
+ const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
+ const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
- q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
+ q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- lower_res = wrapper::vmovn(res_lower);
+ // Scale lower result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ lower_res = wrapper::vmovn(res_lower);
- // Compute upper result for stride_x == 1
- if(pool_stride_x == 1)
- {
- // Shifted row sum
- const q16x8x2_t vrsum_shifted =
+ // Compute upper result for stride_x == 1
+ if (pool_stride_x == 1)
{
- {
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
- }
- };
-
- // Pair-wise add shifted row
- q16x8_t res_upper = wrapper::vcombine(
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
-
- // Scale upper result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- upper_res = wrapper::vmovn(res_upper);
+ // Shifted row sum
+ const q16x8x2_t vrsum_shifted = {
+ {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+
+ // Pair-wise add shifted row
+ q16x8_t res_upper = wrapper::vcombine(
+ wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
+ wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]),
+ wrapper::vgethigh(vrsum_shifted.val[1])));
+
+ // Scale upper result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ upper_res = wrapper::vmovn(res_upper);
+ }
}
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
- lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
- if(pool_stride_x == 1)
+ else
{
- const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
- upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+ const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
+ lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
+ if (pool_stride_x == 1)
+ {
+ const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
+ upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+ }
}
- }
- if(have_different_qinfo)
- {
- const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
- lower_res = wrapper::vgetlow(requantized_dst);
- upper_res = wrapper::vgethigh(requantized_dst);
- }
- auto out_ptr = reinterpret_cast<T *>(out.ptr());
- // Store result
- if(pool_stride_x == 1)
- {
- write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
- }
- else
- {
- write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
- }
- },
- in, out);
+ if (have_different_qinfo)
+ {
+ const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
+ lower_res = wrapper::vgetlow(requantized_dst);
+ upper_res = wrapper::vgethigh(requantized_dst);
+ }
+ auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ // Store result
+ if (pool_stride_x == 1)
+ {
+ write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
+ }
+ else
+ {
+ write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
+ }
+ },
+ in, out);
}
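
For the stride-2 case of the 2x2 AVG path above, the element-wise row addition followed by a pair-wise add (wrapper::vpadd) leaves each lane holding one complete 2x2 window sum, which is then scaled per lane. A plain scalar sketch of that reduction, under the simplifying assumption of a single row pair and no padding:

#include <algorithm>
#include <cstddef>
#include <vector>

// Each output element is the sum of one 2x2 window (top/bottom rows, stride 2, no padding).
std::vector<int> sum_2x2_stride2(const std::vector<int> &top, const std::vector<int> &bottom)
{
    std::vector<int>  out;
    const std::size_t n = std::min(top.size(), bottom.size());
    for (std::size_t i = 0; i + 1 < n; i += 2)
    {
        out.push_back(top[i] + top[i + 1] + bottom[i] + bottom[i + 1]);
    }
    return out;
}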
template <typename T>
-void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling3_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
@@ -533,13 +587,13 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
- constexpr int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ constexpr int pool_size = 3;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
@@ -547,147 +601,145 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds
const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
+ const T *const src_top_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+ const T *const src_middle_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+ const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+ src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
const int src_w = src->info()->dimension(0);
const int src_h = src->info()->dimension(1);
const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
const int dst_w = dst0->info()->dimension(0);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto x_val = id.x() * pool_stride_x;
- const auto y_val_0 = id.y() * pool_stride_y;
- const auto y_val_1 = (id.y() * pool_stride_y) + 1;
- const auto y_val_2 = (id.y() * pool_stride_y) + 2;
-
- auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
- auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
- auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
- x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
-
- q8x8_t fres = {};
- q8x16_t fqres = {};
-
- if(pool_info.pool_type == PoolingType::AVG)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- // Convert data to u16
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
-
- // Calculate row sums
- const q16x8x2_t vrsum =
+ const auto x_val = id.x() * pool_stride_x;
+ const auto y_val_0 = id.y() * pool_stride_y;
+ const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+ const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+
+ auto top_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+ auto middle_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
+ auto bottom_data =
+ load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+ y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+ q8x8_t fres = {};
+ q8x16_t fqres = {};
+
+ if (pool_info.pool_type == PoolingType::AVG)
{
+ // Convert data to u16
+ const q16x8x2_t top_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+ const q16x8x2_t middle_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}};
+ const q16x8x2_t bottom_data_q16 = {
+ {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+ // Calculate row sums
+ const q16x8x2_t vrsum = {{
+ wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
+ wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
+ }};
+ const q16x8x2_t vrsum_shifted_1 = {
+ {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+ const q16x8x2_t vrsum_shifted_2 = {
+ {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}};
+ // Calculate final sum
+ q16x8x2_t final_sum = {{
+ wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
+ wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+ }};
+ if (pool_stride_x == 2)
{
- wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
- wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
+ q16x8_t res = {
+ wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2),
+ wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6),
+ wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2),
+ wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6),
+ };
+
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ fres = wrapper::vmovn(res);
}
- };
- const q16x8x2_t vrsum_shifted_1 =
- {
+ else
{
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
+ // Scale lower result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+                    // Scale upper result
+ scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size,
+ upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+ pool_stride_x, pool_stride_y);
+ fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
}
- };
- const q16x8x2_t vrsum_shifted_2 =
+ }
+ else
{
+ const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
+ const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
+ const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
+ const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
+
+ if (pool_stride_x == 2)
{
- wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_2(vrsum.val[1], vrsum.val[1])
+ const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}};
+ static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14};
+ fres = wrapper::vtbl(table, lookup_val);
}
- };
- // Calculate final sum
- q16x8x2_t final_sum =
- {
+ else
{
- wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
- wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+ fqres = final_max;
}
- };
- if(pool_stride_x == 2)
- {
- q16x8_t res =
- {
- wrapper::vgetlane(final_sum.val[0], 0),
- wrapper::vgetlane(final_sum.val[0], 2),
- wrapper::vgetlane(final_sum.val[0], 4),
- wrapper::vgetlane(final_sum.val[0], 6),
- wrapper::vgetlane(final_sum.val[1], 0),
- wrapper::vgetlane(final_sum.val[1], 2),
- wrapper::vgetlane(final_sum.val[1], 4),
- wrapper::vgetlane(final_sum.val[1], 6),
- };
-
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fres = wrapper::vmovn(res);
}
- else
- {
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
- }
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
- const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
- const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
- const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
- if(pool_stride_x == 2)
+ // Store result
+ if (pool_stride_x == 1)
{
- const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
- static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- fres = wrapper::vtbl(table, lookup_val);
+ if (src_qinfo != dst_qinfo)
+ {
+ fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres),
+ requant_qinfo);
+ }
+ write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres),
+ wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
}
else
{
- fqres = final_max;
- }
- }
-
- // Store result
- if(pool_stride_x == 1)
- {
- if(src_qinfo != dst_qinfo)
- {
- fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
- }
- write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
- }
- else
- {
- if(src_qinfo != dst_qinfo)
- {
- fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
+ if (src_qinfo != dst_qinfo)
+ {
+ fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
+ }
+ write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
}
- write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
- }
- },
- in, out);
+ },
+ in, out);
}
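
The 3x3 AVG path above forms the 3-wide horizontal sums by adding the combined row sum to copies of itself shifted by one and by two elements (wrapper::vext_1 / wrapper::vext_2); with stride 2 only every other lane is kept afterwards. A scalar sketch of that sliding sum, ignoring the vector-boundary handling:

#include <cstddef>
#include <vector>

// out[i] == rowsum[i] + rowsum[i + 1] + rowsum[i + 2]
std::vector<int> sliding_sum3(const std::vector<int> &rowsum)
{
    std::vector<int> out;
    for (std::size_t i = 0; i + 2 < rowsum.size(); ++i)
    {
        out.push_back(rowsum[i] + rowsum[i + 1] + rowsum[i + 2]);
    }
    return out;
}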
template <typename T>
-void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_quantized_neon_nchw(const ITensor *src,
+ ITensor *dst0,
+ ITensor *dst1,
+ PoolingLayerInfo &pool_info,
+ const Window &window_src,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dst1);
Iterator in(src, window_src);
@@ -697,74 +749,81 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *
using q16_t = typename wrapper::traits::promote_t<T>;
using q32_t = typename wrapper::traits::promote_t<q16_t>;
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+ const int pool_pad_right = pool_info.pad_stride_info.pad_right();
+ const int pool_pad_top = pool_info.pad_stride_info.pad_top();
+ const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+ const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
- const int src_w = src->info()->dimension(0);
- const int src_h = src->info()->dimension(1);
- const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
- const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
- const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- T res = std::numeric_limits<T>::min();
-
- if(pool_info.pool_type != PoolingType::MAX)
+ const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+ const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
+ const int src_w = src->info()->dimension(0);
+ const int src_h = src->info()->dimension(1);
+ const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
+ const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
+ const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
+
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
{
- q32_t sres = 0;
-
- // Calculate scale
- const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
+ T res = std::numeric_limits<T>::min();
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
+ if (pool_info.pool_type != PoolingType::MAX)
{
- for(int x = 0; x < pool_size_x; ++x)
+ q32_t sres = 0;
+
+ // Calculate scale
+ const float scale = calculate_avg_scale_pool2d(
+ pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+ upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(
+ in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
- sres += data;
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+ sres += data;
+ }
}
+ // Divide by scale
+ res = static_cast<T>(support::cpp11::round(sres * scale));
}
- // Divide by scale
- res = static_cast<T>(support::cpp11::round(sres * scale));
- }
- else
- {
- for(int y = 0; y < pool_size_y; ++y)
+ else
{
- for(int x = 0; x < pool_size_x; ++x)
+ for (int y = 0; y < pool_size_y; ++y)
{
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+ for (int x = 0; x < pool_size_x; ++x)
+ {
+ const auto in_ptr = reinterpret_cast<const T *>(
+ in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
- const int idx = x + id.x() * pool_stride_x - pool_pad_left;
- const int idy = y + id.y() * pool_stride_y - pool_pad_top;
- const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
- res = std::max(res, data);
+ const int idx = x + id.x() * pool_stride_x - pool_pad_left;
+ const int idy = y + id.y() * pool_stride_y - pool_pad_top;
+ const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+ res = std::max(res, data);
+ }
}
}
- }
- // Store result
- res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
- *(reinterpret_cast<T *>(out.ptr())) = res;
- },
- in, out);
+ // Store result
+ res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(
+ Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo)
+ : res;
+ *(reinterpret_cast<T *>(out.ptr())) = res;
+ },
+ in, out);
}
#endif /* defined(ENABLE_NCHW_KERNELS) */
} // namespace cpu
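
Requantisation note for the QASYMM8 kernels in this file: when the source and destination quantisation infos differ, the kernels fold both into a single (requant_scale, requant_offset) pair and quantise with that, instead of dequantising and re-quantising in two steps. A scalar sketch of the same arithmetic follows, with an illustrative helper name; the kernels themselves use the NEON vrequantize_pooling wrappers.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: expresses a QASYMM8 value quantised with (src_scale, src_offset)
// in the (dst_scale, dst_offset) quantisation space, using the folded parameters.
inline uint8_t requantize_scalar(uint8_t v,
                                 float src_scale, int32_t src_offset,
                                 float dst_scale, int32_t dst_offset)
{
    const float   requant_scale  = dst_scale / src_scale;
    const int32_t requant_offset =
        dst_offset - static_cast<int32_t>(static_cast<float>(src_offset) / requant_scale);
    // quantize(v, {scale, offset}) == round(v / scale) + offset, clamped to the QASYMM8 range.
    const int32_t q =
        static_cast<int32_t>(std::lround(static_cast<float>(v) / requant_scale)) + requant_offset;
    return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}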
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
index 013e25537c..ce89199b5d 100644
--- a/src/cpu/kernels/pool3d/neon/impl.h
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -25,9 +25,10 @@
#define SRC_CORE_POOLING_3D_LAYER_IMPL_H
#include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include "src/cpu/kernels/pool3d/neon/quantized.h"
namespace arm_compute
@@ -37,8 +38,13 @@ namespace cpu
namespace
{
template <typename T>
-void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
- const int window_start_x, const int window_end_x, const int window_step_x)
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
{
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
Iterator out(dst0, window_out);
vector_type vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- vres = wrapper::vmax(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmax(vres, data);
+ }
}
}
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
}
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res(0);
- res = -std::numeric_limits<float>::infinity();
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res(0);
+ res = -std::numeric_limits<float>::infinity();
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res = std::max(res, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res = std::max(res, data);
+ }
}
}
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
}
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- },
- out);
+ },
+ out);
}
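
The NDHWC 3D kernels clamp each pooling window before the accumulation loops: the theoretical start index can land inside the front/top/left padding and the end can run past the input, so both are clipped. A scalar sketch of that clipping for one dimension (the struct and function names are illustrative):

#include <algorithm>

struct PoolRange
{
    int start; // first in-window index to accumulate
    int end;   // one past the last in-window index
};

// Illustrative only: mirrors the pool_start_* / pool_end_* computations above for one dimension.
inline PoolRange clipped_pool_range(int out_idx, int stride, int pad_before, int pool_size, int input_dim)
{
    const int in_idx = out_idx * stride - pad_before; // theoretical (unclipped) window start
    const int start  = std::max(0, -in_idx);          // skip positions inside the padding
    const int end    = std::min({pool_size, input_dim + pad_before - in_idx, input_dim - in_idx});
    return {start, end};
}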
template <typename T>
-void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
- const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
{
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
using vector_type = typename vtype::type;
@@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
Iterator out(dst0, window_out);
vector_type vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
- // Calculate scale
- const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x,
- pool_stride_y, pool_stride_z);
- const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- // Perform pooling
- vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- vres = wrapper::vadd(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vadd(vres, data);
+ }
}
}
- }
- // Divide by scale
- vres = wrapper::vmul(vres, scale_v);
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res(0);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res(0);
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res += data;
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
}
}
- }
- // Divide by scale
- res *= scale;
+ // Divide by scale
+ res *= scale;
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- },
- out);
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
}
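
All of the NDHWC kernels share the same channel-loop structure: a vectorised main loop that advances by the SIMD width, followed by a scalar left-overs loop for the remaining channels. A minimal sketch of that structure, shown here on a plain scale-by-reciprocal step similar to the AVG divide-by-scale above ('step' stands in for the SIMD width):

// Illustrative only: plain C++ stand-in for the vectorised main loop plus scalar tail.
void scale_channels(const float *sum, float *dst, int num_channels, float scale, int step)
{
    int c = 0;
    for (; c <= num_channels - step; c += step) // vectorised main loop (one vmul per chunk on NEON)
    {
        for (int i = 0; i < step; ++i)
        {
            dst[c + i] = sum[c + i] * scale;
        }
    }
    for (; c < num_channels; ++c) // scalar left-overs loop
    {
        dst[c] = sum[c] * scale;
    }
}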
template <typename T>
-void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
- const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src,
+ ITensor *dst0,
+ Pooling3dLayerInfo &pool_info,
+ const Window &window_out,
+ const int window_start_x,
+ const int window_end_x,
+ const int window_step_x)
{
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
using vector_type = typename vtype::type;
@@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL
Iterator out(dst0, window_out);
vector_type vres;
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- // Calculate scale
- const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x,
- pool_stride_y, pool_stride_z);
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
- int x_off = window_start_x;
+ int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- // Perform pooling
- vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- vres = wrapper::vmla(vres, data, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmla(vres, data, data);
+ }
}
}
- }
-
- const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
- // Divide by scale
- vres = wrapper::vmul(vres, scale_v);
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
- // Calculate square-root
- vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
- }
+ // Calculate square-root
+ vres = wrapper::vinv(wrapper::vinvsqrt(vres));
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res(0);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res(0);
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res += data * data;
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data * data;
+ }
}
}
- }
- // Divide by scale
- res *= scale;
+ // Divide by scale
+ res *= scale;
- // Square root
- res = std::sqrt(res);
+ // Square root
+ res = std::sqrt(res);
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- },
- out);
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
}
} // namespace
@@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
// Needed to handle loop left-over
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- switch(pool_info.pool_type)
+ switch (pool_info.pool_type)
{
case PoolingType::MAX:
- max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
break;
case PoolingType::AVG:
- avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
break;
case PoolingType::L2:
- l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+ window_step_x);
break;
default:
ARM_COMPUTE_ERROR("Pool operation not supported");
@@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
// Needed to handle loop left-over
window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
- switch(pool_info.pool_type)
+ switch (pool_info.pool_type)
{
case PoolingType::MAX:
max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
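For reference, the L2 pooling path reformatted above reduces, per output element, to a sum of squares over the valid pooling region, a multiply by the averaging scale, and a square root. A minimal scalar sketch of that arithmetic, assuming the region's values sit in a contiguous buffer (illustrative names only, not the library's wrapper:: API):

#include <cmath>
#include <cstddef>

// Scalar reference for one output element of L2 pooling:
// out = sqrt((1/N) * sum(x_i^2)) over the N valid elements of the region.
float l2_pool_reference(const float *region, std::size_t count)
{
    float acc = 0.0f;
    for (std::size_t i = 0; i < count; ++i)
    {
        acc += region[i] * region[i]; // same accumulation as vmla(vres, data, data)
    }
    const float scale = 1.0f / static_cast<float>(count); // stands in for calculate_avg_scale_pool3d
    return std::sqrt(acc * scale);
}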
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h
index ac14f5eafa..8819907901 100644
--- a/src/cpu/kernels/pool3d/neon/quantized.h
+++ b/src/cpu/kernels/pool3d/neon/quantized.h
@@ -26,17 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/PoolingHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
template <typename T>
-void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
- const int window_step_x)
+void avg_poolingMxNxD_q8_neon_ndhwc(
+ const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
{
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
// "new_offset" doesn't have to consider the "half_scale_v" in its computation
// With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ const int32_t new_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- // Calculate scale
- const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
- pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ // Calculate scale
+ const float scale =
+ calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+ upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+ pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
- int x_off = window_start_x;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ int x_off = window_start_x;
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+ q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
- const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
- vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
- vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
- vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
- vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+ const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+ vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+ vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+ vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+ vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+ }
}
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float32x4x4_t vres =
+ if (src_qinfo != dst_qinfo)
{
- {
+ const float32x4x4_t vres = {{
vcvtq_f32_q32(vres1),
vcvtq_f32_q32(vres2),
vcvtq_f32_q32(vres3),
vcvtq_f32_q32(vres4),
- }
- };
- const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
- }
- else
- {
- const float32x4_t scale_v = vdupq_n_f32(scale);
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
- vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
- vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
- vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
- const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
- const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }};
+ const auto requantized_dst =
+ vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+ }
+ else
+ {
+ const float32x4_t scale_v = vdupq_n_f32(scale);
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+ vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+ vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+ vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+ const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+ }
}
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- q32_t res = static_cast<q32_t>(0.f);
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q32_t res = static_cast<q32_t>(0.f);
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
- res += data;
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
}
}
- }
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ const float new_scale = quant_rescale / scale;
+ const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
- }
- else
- {
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+ }
+ else
+ {
+ // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+ res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
- }
- },
- out);
+ },
+ out);
}
template <typename T>
-void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
- const int window_step_x)
+void max_poolingMxNxD_q8_neon_ndhwc(
+ const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
{
using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // Computing the theoretical input starting/ending points
- const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
- const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
- const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+ const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
+ const int32_t requant_offset =
+ dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+ const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
- const int pool_start_x = std::max(0, -in_idx_width);
- const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
- const int pool_start_y = std::max(0, -in_idx_height);
- const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+ execute_window_loop(
+ window_out,
+ [&](const Coordinates &id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
- const int pool_start_z = std::max(0, -in_idx_depth);
- const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
- // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
- const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
- const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
- const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
- const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
- int x_off = window_start_x;
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
- {
- q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+ int x_off = window_start_x;
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- vres = wrapper::vmax(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
}
}
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
- requant_qinfo) :
- vres);
- }
- // Leftovers using half the window step
- for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
- {
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo)
+ ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+ wrapper::vgethigh(vres), requant_qinfo)
+ : vres);
+ }
- // Perform pooling
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Leftovers using half the window step
+ for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+
+ // Perform pooling
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- vres = wrapper::vmax(vres, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ vres = wrapper::vmax(vres, data);
+ }
}
}
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
- (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
- }
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- T res = std::numeric_limits<T>::min();
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+ (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+ }
- for(int z = pool_start_z; z < pool_end_z; ++z)
+ // Left-overs loop
+ for (; x_off < window_end_x; ++x_off)
{
- const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
- for(int y = pool_start_y; y < pool_end_y; ++y)
+ T res = std::numeric_limits<T>::min();
+
+ for (int z = pool_start_z; z < pool_end_z; ++z)
{
- const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
- for(int x = pool_start_x; x < pool_end_x; ++x)
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for (int y = pool_start_y; y < pool_end_y; ++y)
{
- const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
- const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-
- res = std::max(res, data);
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for (int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+ res = std::max(res, data);
+ }
}
}
- }
- // Store result
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
- }
- else
- {
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ // Store result
+ if (src_qinfo != dst_qinfo)
+ {
+ const float res_f = static_cast<float>(res);
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
}
- }
- },
- out);
+ },
+ out);
}
} // namespace cpu
} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
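The quantized average-pooling kernel above keeps two store paths: when source and destination quantization match, the accumulated sum is rescaled and 0.5f is added so the final truncation rounds to nearest; otherwise the averaging and the quantization change are folded into a single requantization via new_scale and new_offset. A scalar sketch of both paths, assuming uint8 data and conventional round-to-nearest (illustrative; the library's quantize helper may apply a different rounding policy):

#include <algorithm>
#include <cmath>
#include <cstdint>

// acc       : integer sum over the valid pooling region
// avg_scale : 1 / (number of valid elements)
uint8_t store_same_qinfo(int32_t acc, float avg_scale)
{
    // Adding 0.5f makes the truncating cast round to nearest instead of towards zero.
    return static_cast<uint8_t>(0.5f + static_cast<float>(acc) * avg_scale);
}

uint8_t store_requantized(int32_t acc, float avg_scale,
                          float src_scale, int32_t src_offset,
                          float dst_scale, int32_t dst_offset)
{
    // Single-step requantization: fold the averaging and the scale change together.
    const float   quant_rescale = dst_scale / src_scale;
    const int32_t new_offset    = dst_offset - static_cast<int32_t>(static_cast<float>(src_offset) / quant_rescale);
    const float   new_scale     = quant_rescale / avg_scale;
    const int32_t q             = static_cast<int32_t>(std::lround(static_cast<float>(acc) / new_scale)) + new_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}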
diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp
index 5d50dce907..505c18c27d 100644
--- a/src/cpu/kernels/range/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/range/generic/neon/fp16.cpp
@@ -23,10 +23,10 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
namespace arm_compute
{
diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp
index 6044f0f886..e5e472abb5 100644
--- a/src/cpu/kernels/range/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/range/generic/neon/fp32.cpp
@@ -22,10 +22,10 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
namespace arm_compute
{
diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h
index 62144e6776..f8c30d52a0 100644
--- a/src/cpu/kernels/range/generic/neon/impl.h
+++ b/src/cpu/kernels/range/generic/neon/impl.h
@@ -26,8 +26,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
@@ -47,35 +48,36 @@ void neon_range_function(ITensor *output, float start, float step, const Window
const auto window_end_x = static_cast<int>(window.x().end());
const int window_step_x = 16 / sizeof(T);
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator output_it(output, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- for(int count = 0; count < window_step_x; ++count)
+ int x = window_start_x;
+ const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
- }
-
- // start + step * id
- const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
- wrapper::vstore(out_ptr + x, res_vec);
- }
+ for (int count = 0; count < window_step_x; ++count)
+ {
+ id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = start + x * step;
- *(out_ptr + x) = res;
- }
+ // start + step * id
+ const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+ wrapper::vstore(out_ptr + x, res_vec);
+ }
- },
- output_it);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto res = start + x * step;
+ *(out_ptr + x) = res;
+ }
+ },
+ output_it);
}
} // namespace cpu
} // namespace arm_compute
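The range kernel reformatted above fills the output with start + i * step, consuming one full vector per iteration and finishing with a scalar leftover loop. A standalone sketch of the same loop split in plain C++, without NEON (step_x plays the role of window_step_x; names are illustrative):

#include <cstddef>

// Fill out[0..n) with start + i * step using a main loop that advances
// step_x elements at a time and a leftover loop for the remainder.
void range_reference(float *out, std::size_t n, float start, float step, std::size_t step_x = 4)
{
    std::size_t i = 0;
    for (; i + step_x <= n; i += step_x)
    {
        // In the NEON kernel this inner body is a single vmla(start_vec, id_vec, step_vec) + vstore.
        for (std::size_t lane = 0; lane < step_x; ++lane)
        {
            out[i + lane] = start + static_cast<float>(i + lane) * step;
        }
    }
    for (; i < n; ++i) // compute left-over elements one by one
    {
        out[i] = start + static_cast<float>(i) * step;
    }
}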
diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h
index 25d52bfe7f..cade91e8dd 100644
--- a/src/cpu/kernels/range/list.h
+++ b/src/cpu/kernels/range/list.h
@@ -28,8 +28,7 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_RANGE_KERNEL(func_name) \
- void func_name(ITensor *output, float start, float step, const Window &window)
+#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window)
DECLARE_RANGE_KERNEL(fp16_neon_range_function);
DECLARE_RANGE_KERNEL(fp32_neon_range_function);
diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
index c265d5d4eb..cf99830562 100644
--- a/src/cpu/kernels/roialign/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
@@ -29,7 +29,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_fp16_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
index 51355aaef0..c1dba99b5e 100644
--- a/src/cpu/kernels/roialign/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
@@ -26,7 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_fp32_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<float, float>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h
index e5e604330a..db2f67705d 100644
--- a/src/cpu/kernels/roialign/generic/neon/impl.h
+++ b/src/cpu/kernels/roialign/generic/neon/impl.h
@@ -46,7 +46,7 @@ inline input_data_type roi_align_1x1(const ITensor *input,
float region_end_y,
int pz)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
return input_data_type(0);
}
@@ -55,9 +55,9 @@ inline input_data_type roi_align_1x1(const ITensor *input,
const DataLayout data_layout = input->info()->data_layout();
float avg = 0;
// Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
+ for (int iy = 0; iy < grid_size_y; ++iy)
{
- for(int ix = 0; ix < grid_size_x; ++ix)
+ for (int ix = 0; ix < grid_size_x; ++ix)
{
// Align the window in the middle of every bin
float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
@@ -78,20 +78,28 @@ inline input_data_type roi_align_1x1(const ITensor *input,
const float w2 = hy * lx;
const float w3 = ly * hx;
const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
+ const auto data1 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
else
{
- const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
- const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
- const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
- const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
+ const auto data1 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
+ const auto data2 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
+ const auto data3 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
+ const auto data4 = *reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
@@ -117,21 +125,21 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
int pz,
const QuantizationInfo &out_qinfo)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
return input_data_type(out_qinfo.uniform().offset);
}
else
{
- float avg = 0;
- const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
- const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
- const DataLayout data_layout = input->info()->data_layout();
+ float avg = 0;
+ const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
+ const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
+ const DataLayout data_layout = input->info()->data_layout();
// Iterate through the aligned pooling region
- for(int iy = 0; iy < grid_size_y; ++iy)
+ for (int iy = 0; iy < grid_size_y; ++iy)
{
- for(int ix = 0; ix < grid_size_x; ++ix)
+ for (int ix = 0; ix < grid_size_x; ++ix)
{
// Align the window in the middle of every bin
float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
@@ -153,41 +161,89 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
const float w3 = ly * hx;
const float w4 = ly * lx;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- if(is_qasymm_signed)
+ if (is_qasymm_signed)
{
- float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+ float data1 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_low, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data2 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_high, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data3 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_low, y_high, pz, roi_batch))),
+ input_qinfo);
+ float data4 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(x_high, y_high, pz, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
else
{
- float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
- float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
- float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
- float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+ float data1 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data2 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))),
+ input_qinfo);
+ float data3 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))),
+ input_qinfo);
+ float data4 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
else
{
- if(is_qasymm_signed)
+ if (is_qasymm_signed)
{
- const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+ const auto data1 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_low, y_low, roi_batch))),
+ input_qinfo);
+ const auto data2 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_high, y_low, roi_batch))),
+ input_qinfo);
+ const auto data3 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_low, y_high, roi_batch))),
+ input_qinfo);
+ const auto data4 =
+ dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+ Coordinates(pz, x_high, y_high, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
else
{
- const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
- const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
- const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
- const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+ const auto data1 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))),
+ input_qinfo);
+ const auto data2 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))),
+ input_qinfo);
+ const auto data3 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))),
+ input_qinfo);
+ const auto data4 =
+ dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+ input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))),
+ input_qinfo);
avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
}
}
@@ -197,7 +253,7 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input,
avg /= grid_size_x * grid_size_y;
input_data_type res = 0;
- if(is_qasymm_signed)
+ if (is_qasymm_signed)
{
res = quantize_qasymm8_signed(avg, out_qinfo);
}
@@ -215,7 +271,12 @@ inline float compute_region_coordinate(int p, float bin_size, float roi_anchor,
}
template <typename input_data_type, typename roi_data_type>
-void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void roi_align(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -240,7 +301,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo
const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(rois->buffer());
const QuantizationInfo &rois_qinfo = rois->info()->quantization_info();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
@@ -252,7 +313,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo
float x2(qx2);
float y1(qy1);
float y2(qy2);
- if(is_qasymm)
+ if (is_qasymm)
{
x1 = dequantize_qasymm16(qx1, rois_qinfo);
x2 = dequantize_qasymm16(qx2, rois_qinfo);
@@ -267,44 +328,47 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo
float bin_size_y = roi_dims_y / pool_info.pooled_height();
// Iterate through all feature maps
- for(int ch = 0; ch < input_chanels; ++ch)
+ for (int ch = 0; ch < input_chanels; ++ch)
{
// Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
+ for (int py = 0; py < pooled_h; ++py)
{
- for(int px = 0; px < pooled_w; ++px)
+ for (int px = 0; px < pooled_w; ++px)
{
- const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
- const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
- const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
- const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
- const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
- const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
+ const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
+ const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
+ const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
+ const float region_end_y =
+ compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
+ const int roi_bin_grid_x =
+ (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
+ const int roi_bin_grid_y =
+ (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
input_data_type out_val(0);
- if(is_qasymm)
+ if (is_qasymm)
{
out_val = roi_align_1x1_qasymm8<input_data_type>(
- input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
+ input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y,
+ bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
}
else
{
- out_val = roi_align_1x1<input_data_type>(
- input, roi_batch, region_start_x, bin_size_x,
- roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
- roi_bin_grid_y, region_end_y, ch);
+ out_val = roi_align_1x1<input_data_type>(input, roi_batch, region_start_x, bin_size_x,
+ roi_bin_grid_x, region_end_x, region_start_y,
+ bin_size_y, roi_bin_grid_y, region_end_y, ch);
}
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
- *out_ptr = out_val;
+ auto out_ptr = reinterpret_cast<input_data_type *>(
+ output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
+ *out_ptr = out_val;
}
else
{
- auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
- *out_ptr = out_val;
+ auto out_ptr = reinterpret_cast<input_data_type *>(
+ output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
+ *out_ptr = out_val;
}
}
}
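The ROI align changes above are formatting-only, but the core of roi_align_1x1 is a grid of bilinear samples: each sample weights the four surrounding pixels by products of the fractional offsets (w1 = hy*hx, w2 = hy*lx, w3 = ly*hx, w4 = ly*lx) and accumulates them into an average. A minimal sketch of one such sample on a row-major single-channel image, assuming in-range coordinates (the real kernel handles layout, batching and out-of-range cases separately):

#include <algorithm>
#include <cmath>

// One bilinear sample over a row-major single-channel image.
// Assumes 0 <= x <= width - 1 and 0 <= y <= height - 1.
float bilinear_sample(const float *img, int width, int height, float x, float y)
{
    const int x_low  = static_cast<int>(std::floor(x));
    const int y_low  = static_cast<int>(std::floor(y));
    const int x_high = std::min(x_low + 1, width - 1);
    const int y_high = std::min(y_low + 1, height - 1);

    const float ly = y - static_cast<float>(y_low);
    const float lx = x - static_cast<float>(x_low);
    const float hy = 1.0f - ly;
    const float hx = 1.0f - lx;

    const float w1 = hy * hx; // (x_low,  y_low)
    const float w2 = hy * lx; // (x_high, y_low)
    const float w3 = ly * hx; // (x_low,  y_high)
    const float w4 = ly * lx; // (x_high, y_high)

    return w1 * img[y_low * width + x_low] + w2 * img[y_low * width + x_high] +
           w3 * img[y_high * width + x_low] + w4 * img[y_high * width + x_high];
}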
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
index d6bd9a95ce..11c5770f53 100644
--- a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
@@ -26,7 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_qu8_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
index a839581aff..7f93cc87b3 100644
--- a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
@@ -26,7 +26,12 @@ namespace arm_compute
{
namespace cpu
{
-void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_qs8_roialign(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)
{
return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info);
}
diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h
index 1c71b02488..fdb3c0050d 100644
--- a/src/cpu/kernels/roialign/list.h
+++ b/src/cpu/kernels/roialign/list.h
@@ -27,9 +27,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_ROIALIGN_KERNEL(func_name) \
- void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \
- ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+#define DECLARE_ROIALIGN_KERNEL(func_name) \
+ void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \
+ const Window &window, const ThreadInfo &info)
DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign);
DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign);
DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign);
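The list.h diffs here and for the range kernels only collapse the declaration macros onto fewer lines; the underlying pattern is a macro that stamps out one prototype per data-type-specific kernel. A toy illustration of the idea (hypothetical names and signature, not the library's):

// Each invocation declares one kernel entry point with the shared signature.
#define DECLARE_EXAMPLE_KERNEL(func_name) void func_name(const float *input, float *output, int length)

DECLARE_EXAMPLE_KERNEL(neon_fp32_example); // void neon_fp32_example(const float *, float *, int);
DECLARE_EXAMPLE_KERNEL(neon_fp16_example); // void neon_fp16_example(const float *, float *, int);

#undef DECLARE_EXAMPLE_KERNEL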
diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp
index 895f42215e..bd01569cc4 100644
--- a/src/cpu/kernels/scale/neon/fp16.cpp
+++ b/src/cpu/kernels/scale/neon/fp16.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -41,8 +42,12 @@ namespace arm_compute
{
namespace
{
-void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -62,33 +67,46 @@ void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *of
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
Iterator out(dst, window);
const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -103,68 +121,97 @@ void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *o
win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator in(src, win_in);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type;
const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const float16_t *in_ptr =
+ reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<float16_t *>(out.ptr()) =
+ static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
else
{
ARM_COMPUTE_ERROR("Not implemented");
}
}
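
In the CONSTANT branch above, each of the four neighbouring samples falls back to the border value when its coordinate leaves the valid range, and the result is blended by scale_helpers::delta_bilinear. A plausible scalar reading of that blend, given as an assumption since the helper's definition is not part of this diff, weights the four corners by the fractional offsets dx and dy:

    // Illustrative only: bilinear blend of four corner samples a00..a11 with
    // fractional horizontal/vertical offsets dx, dy in [0, 1).
    inline float bilinear_blend(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float w00 = (1.f - dx) * (1.f - dy); // top-left
        const float w01 = dx * (1.f - dy);         // top-right
        const float w10 = (1.f - dx) * dy;         // bottom-left
        const float w11 = dx * dy;                 // bottom-right
        return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
    }
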
-}
+} // namespace
namespace cpu
{
-void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -172,4 +219,4 @@ void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, c
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
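
Every kernel in this file derives its ratio from scale_utils::calculate_resize_ratio(input_dim, output_dim, align_corners). A common definition of such a ratio, stated here as an assumption rather than as the library's implementation, is:

    // Sketch: resize ratio with optional align_corners handling.
    inline float resize_ratio(unsigned int in_dim, unsigned int out_dim, bool align_corners)
    {
        // With align_corners, the first and last samples of input and output coincide,
        // so the ratio is taken over (dim - 1) intervals instead of dim samples.
        return (align_corners && out_dim > 1)
                   ? static_cast<float>(in_dim - 1) / static_cast<float>(out_dim - 1)
                   : static_cast<float>(in_dim) / static_cast<float>(out_dim);
    }
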
diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp
index 2ab14cf83a..bbf92e0412 100644
--- a/src/cpu/kernels/scale/neon/integer.cpp
+++ b/src/cpu/kernels/scale/neon/integer.cpp
@@ -22,8 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -33,8 +34,12 @@ namespace arm_compute
{
namespace
{
-void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void u8_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -54,43 +59,58 @@ void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void u8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int input_width = src->info()->dimension(1);
const int input_height = src->info()->dimension(2);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_stride_wc = in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
+ const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+ const int in_stride_wc =
+ in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
// Don't increment in Y and Z direction for the input tensor
// A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -100,24 +120,37 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
Iterator in(src, win_in);
const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
+ const uint8_t *in_ptr =
+ reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height)
+ ? *in_ptr
+ : const_border_value;
+ const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<uint8_t *>(out.ptr()) =
+ static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -152,12 +185,12 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -174,7 +207,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -205,7 +238,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -270,19 +303,21 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const auto out_2_int = wrapper::vcvta<uint32_t>(out_2);
const auto out_3_int = wrapper::vcvta<uint32_t>(out_3);
#else // defined(__aarch64__) && !defined(BARE_METAL)
- const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
- const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
- const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
- const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
+ const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
+ const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
+ const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
+ const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -311,18 +346,27 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
}
}
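
The REPLICATE fast path above folds the sampling offset into a per-axis constant, fp_coord_offset = sampling_offset * (scale - 1), which equals (xo + sampling_offset) * scale - sampling_offset, and then splits the source coordinate into a clamped integer pair plus a fractional weight. A scalar sketch of that precomputation with hypothetical names:

    #include <algorithm>
    #include <cmath>

    struct SamplePos
    {
        int   i0;   // lower neighbour, clamped to [0, dim - 1]
        int   i1;   // upper neighbour, clamped to [0, dim - 1]
        float frac; // fractional weight towards i1
    };

    inline SamplePos replicate_sample(int out_coord, float scale, float sampling_offset, int in_dim)
    {
        const float f = out_coord * scale + sampling_offset * (scale - 1.f);
        const int   i = static_cast<int>(std::floor(f));
        SamplePos p;
        p.i0   = std::min(std::max(i, 0), in_dim - 1);
        p.i1   = std::min(std::max(i + 1, 0), in_dim - 1);
        p.frac = f - static_cast<float>(i);
        return p;
    }
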
-void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value);
- if(border_mode == BorderMode::REPLICATE)
+ if (border_mode == BorderMode::REPLICATE)
{
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int in_stride_x = src->info()->strides_in_bytes()[1];
const int in_stride_y = src->info()->strides_in_bytes()[2];
@@ -356,12 +400,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -378,7 +422,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -409,7 +453,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -479,14 +523,16 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
const auto out_2_int = wrapper::vcvt<int32_t>(out_2);
const auto out_3_int = wrapper::vcvt<int32_t>(out_3);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -515,8 +561,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off
}
}
-void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void s16_neon_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -536,33 +586,46 @@ void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ int32_t x = window_start_x;
+ const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+ for (; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
+ wrapper::vloadq(in_ptr + offset + offset_row + x));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+ }
+ },
+ out);
}
-void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s16_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const auto hr =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
Iterator out(dst, window);
const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -577,64 +640,93 @@ void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *of
win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator in(src, win_in);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+ const int16_t *in_ptr =
+ reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+ const auto a00 =
+ (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+ const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+ ? *(in_ptr + in_stride_c)
+ : const_border_value;
+ const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_wc)
+ : const_border_value;
+ const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+ ? *(in_ptr + in_stride_c + in_stride_wc)
+ : const_border_value;
+
+ *reinterpret_cast<int16_t *>(out.ptr()) =
+ static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const auto offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+ const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+ const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+ const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
+ const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+ const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+ const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+ const auto a00 =
+ *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+ const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h * in_stride_wc);
+ const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c +
+ clamped_h1 * in_stride_wc);
+ const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+ clamped_h1 * in_stride_wc);
+
+ *reinterpret_cast<int16_t *>(out.ptr()) =
+ static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+ },
+ in, out);
}
else
{
ARM_COMPUTE_ERROR("Not implemented");
}
}
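
The REPLICATE branch above clamps both neighbour coordinates to the valid range and gathers the four corners through the padded NHWC strides (in_stride_c per width step, in_stride_wc per height step). A small templated sketch of that gather, assuming the two element strides are computed as in the kernel:

    #include <algorithm>

    // Sketch: gather the four neighbours of (w, h) with replicate (clamp-to-edge)
    // borders from a padded NHWC plane addressed by element strides.
    template <typename T>
    inline void gather_corners_replicate(const T *base, int w, int h, int in_w, int in_h,
                                         int stride_c, int stride_wc,
                                         T &a00, T &a01, T &a10, T &a11)
    {
        const int w0 = std::min(std::max(w, 0), in_w - 1);
        const int w1 = std::min(std::max(w + 1, 0), in_w - 1);
        const int h0 = std::min(std::max(h, 0), in_h - 1);
        const int h1 = std::min(std::max(h + 1, 0), in_h - 1);
        a00 = base[w0 * stride_c + h0 * stride_wc];
        a01 = base[w1 * stride_c + h0 * stride_wc];
        a10 = base[w0 * stride_c + h1 * stride_wc];
        a11 = base[w1 * stride_c + h1 * stride_wc];
    }
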
-}
+} // namespace
namespace cpu
{
-void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
else
{
@@ -642,32 +734,50 @@ void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con
}
}
-void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void u8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
}
-void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s16_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
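
The u8 and s8 bilinear REPLICATE paths in this file finish each vector lane by converting four float32x4 accumulators to integers and narrowing them back to a single 16-byte register through two saturating-narrow stages. A standalone NEON-intrinsics sketch of that pattern for the unsigned case (the kernel itself goes through the wrapper:: layer rather than raw intrinsics):

    #include <arm_neon.h>
    #include <cstdint>

    // Narrow four float32x4_t accumulators to one uint8x16_t:
    // convert -> saturating narrow 32->16 -> saturating narrow 16->8 -> combine.
    inline uint8x16_t narrow_to_u8(float32x4_t v0, float32x4_t v1, float32x4_t v2, float32x4_t v3)
    {
    #if defined(__aarch64__)
        // Round to nearest, ties away from zero (AArch64 only), as in the vcvta branch above.
        const uint32x4_t i0 = vcvtaq_u32_f32(v0);
        const uint32x4_t i1 = vcvtaq_u32_f32(v1);
        const uint32x4_t i2 = vcvtaq_u32_f32(v2);
        const uint32x4_t i3 = vcvtaq_u32_f32(v3);
    #else
        // Truncating conversion fallback, matching the non-AArch64 branch.
        const uint32x4_t i0 = vcvtq_u32_f32(v0);
        const uint32x4_t i1 = vcvtq_u32_f32(v1);
        const uint32x4_t i2 = vcvtq_u32_f32(v2);
        const uint32x4_t i3 = vcvtq_u32_f32(v3);
    #endif
        const uint16x8_t low  = vcombine_u16(vqmovn_u32(i0), vqmovn_u32(i1));
        const uint16x8_t high = vcombine_u16(vqmovn_u32(i2), vqmovn_u32(i3));
        return vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
    }
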
diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h
index 28a1087224..0fe87d15a6 100644
--- a/src/cpu/kernels/scale/neon/list.h
+++ b/src/cpu/kernels/scale/neon/list.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -34,10 +35,10 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
+#define DECLARE_SCALE_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+ InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
+ float sampling_offset, bool align_corners, const Window &window)
DECLARE_SCALE_KERNEL(s16_neon_scale);
DECLARE_SCALE_KERNEL(u8_neon_scale);
@@ -48,14 +49,20 @@ DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
#undef DECLARE_SCALE_KERNEL
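
As a usage note, each DECLARE_SCALE_KERNEL(name) above expands, per the macro body as written, into one free-function declaration with the shared scale-kernel signature; for example DECLARE_SCALE_KERNEL(s16_neon_scale) is textually equivalent to (types come from the arm_compute headers this file already includes):

    void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx,
                        const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode,
                        PixelValue constant_border_value, float sampling_offset, bool align_corners,
                        const Window &window);
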
template <typename T>
-void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
- bool align_corners, const Window &window)
+void nearest_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(offsets);
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int in_stride_y = src->info()->strides_in_bytes()[1];
const int in_stride_z = src->info()->strides_in_bytes()[2];
@@ -84,17 +91,17 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
const int bo_end = window_execution[3].end();
const int bo_step = window_execution[3].step();
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
float yi_f = ((yo + sampling_offset) * scale_y);
int yi = 0;
- if(align_corners)
+ if (align_corners)
{
yi = utils::rounding::round_half_away_from_zero(yi_f);
}
@@ -103,12 +110,12 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
yi = static_cast<int>(std::floor(yi_f));
}
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
float xi_f = ((xo + sampling_offset) * scale_x);
int xi = 0;
- if(align_corners)
+ if (align_corners)
{
xi = utils::rounding::round_half_away_from_zero(xi_f);
}
@@ -121,15 +128,15 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
- auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+ auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
*(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
}
}
@@ -138,9 +145,16 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets
}
template <typename T>
-void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void bilinear_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(offsets);
ARM_COMPUTE_UNUSED(dx);
@@ -148,8 +162,10 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
const int in_stride_y = src->info()->strides_in_bytes()[1];
const int in_stride_z = src->info()->strides_in_bytes()[2];
@@ -180,7 +196,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const int bo_end = window_execution[3].end();
const int bo_step = window_execution[3].step();
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -189,12 +205,12 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
@@ -204,7 +220,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const auto a1 = (yi_f - static_cast<float>(yi));
const auto b1 = (1.f - a1);
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
@@ -223,32 +239,35 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
- if((yi >= 0) && (yi < in_dim_h))
+ if ((yi >= 0) && (yi < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
- in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+ in01 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
}
}
- if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
- in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+ in10 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
- in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ in11 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
}
}
@@ -264,32 +283,33 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
auto in00 = static_cast<T>(const_border_value);
auto in01 = static_cast<T>(const_border_value);
auto in10 = static_cast<T>(const_border_value);
auto in11 = static_cast<T>(const_border_value);
- if((yi >= 0) && (yi < in_dim_h))
+ if ((yi >= 0) && (yi < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
}
}
- if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+ if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
{
- if((xi >= 0) && (xi < in_dim_w))
+ if ((xi >= 0) && (xi < in_dim_w))
{
in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
}
- if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+ if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
{
- in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+ in11 = *(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
}
}
auto out0 = static_cast<T>(0);
@@ -303,14 +323,14 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
}
}
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr = in.ptr() + bo * in_stride_w;
uint8_t *out_ptr = out.ptr() + bo * out_stride_w;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
@@ -327,7 +347,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const int yi1_offset = yi1 * in_stride_z;
const int y_offset = yo * out_stride_z;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
@@ -356,12 +376,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
const int offset = xo * out_stride_y + y_offset;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
- const auto in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
- const auto in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
- const auto in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
- const auto in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+ const auto in00 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+ const auto in01 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+ const auto in10 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+ const auto in11 = wrapper::vloadq(
+ reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
auto out0 = wrapper::vmul(in00, s00);
out0 = wrapper::vmla(out0, in01, s01);
@@ -370,12 +394,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
- const T in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
- const T in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
- const T in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
- const T in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+ const T in00 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+ const T in01 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+ const T in10 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+ const T in11 =
+ *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
T out0 = in00 * s00_s;
out0 += in01 * s01_s;
@@ -394,15 +422,24 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset
}
template <typename T>
-void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void common_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp
index 778459ae39..62a821daa5 100644
--- a/src/cpu/kernels/scale/neon/qasymm8.cpp
+++ b/src/cpu/kernels/scale/neon/qasymm8.cpp
@@ -28,9 +28,16 @@ namespace arm_compute
{
namespace
{
-void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Data layout is NHWC
const int32_t input_width = src->info()->dimension(1);
@@ -40,10 +47,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
@@ -59,7 +68,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
win_in.set(1, Window::Dimension(0, 0, 0));
win_in.set(2, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -68,36 +77,41 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
Iterator out(dst, window);
const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+ const int32_t index_w =
+ *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+ const auto a11 =
+ (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
+ *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
@@ -141,12 +155,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -163,7 +177,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -194,7 +208,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -204,34 +218,82 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
- const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), vscale_in);
- const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), vscale_in);
- const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), vscale_in);
- const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), vscale_in);
+ const auto in00_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)),
+ vscale_in);
+ const auto in00_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)),
+ vscale_in);
+ const auto in00_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)),
+ vscale_in);
+ const auto in00_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)),
+ vscale_in);
const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
- const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), vscale_in);
- const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), vscale_in);
- const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), vscale_in);
- const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), vscale_in);
+ const auto in01_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)),
+ vscale_in);
+ const auto in01_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)),
+ vscale_in);
+ const auto in01_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)),
+ vscale_in);
+ const auto in01_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)),
+ vscale_in);
const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
- const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), vscale_in);
- const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), vscale_in);
- const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), vscale_in);
- const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), vscale_in);
+ const auto in10_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)),
+ vscale_in);
+ const auto in10_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)),
+ vscale_in);
+ const auto in10_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)),
+ vscale_in);
+ const auto in10_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)),
+ vscale_in);
const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
- const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), vscale_in);
- const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), vscale_in);
- const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), vscale_in);
- const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), vscale_in);
+ const auto in11_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)),
+ vscale_in);
+ const auto in11_1 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)),
+ vscale_in);
+ const auto in11_2 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)),
+ vscale_in);
+ const auto in11_3 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)),
+ vscale_in);
auto out_0 = wrapper::vmul(in00_0, s00);
out_0 = wrapper::vmla(out_0, in01_0, s01);
@@ -264,14 +326,16 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
const auto out_2_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
const auto out_3_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -292,7 +356,8 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
#if defined(__aarch64__) && !defined(BARE_METAL)
*(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info);
#else // defined(__aarch64__) && !defined(BARE_METAL)
- *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
+ *(out_ptr_xo_yo + cout * sizeof(uint8_t)) =
+ quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
}
}
@@ -304,28 +369,38 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- if(src->info()->quantization_info() == dst->info()->quantization_info())
+ if (src->info()->quantization_info() == dst->info()->quantization_info())
{
- u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
else
{
- qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
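Note: the constant-border branch reflowed above dequantizes the four neighbouring taps, blends them with the dx/dy fractions, and requantizes the result. The standalone sketch below illustrates that flow in plain C++; it assumes the usual affine mapping q = round(x / scale) + offset with round-to-nearest and clamping, and the standard bilinear weighting behind scale_helpers::delta_bilinear. The helper names and exact rounding are assumptions, not the library's implementation.

// Minimal scalar sketch of the QASYMM8 bilinear path (assumptions noted above).
#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

static float dequantize_u8(uint8_t q, const UniformQInfo &qi)
{
    return qi.scale * (static_cast<int32_t>(q) - qi.offset);
}

static uint8_t quantize_u8(float x, const UniformQInfo &qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Bilinear blend of four dequantized taps with horizontal/vertical fractions dx, dy.
static float bilinear_blend(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy);
    const float w01 = dx * (1.f - dy);
    const float w10 = (1.f - dx) * dy;
    const float w11 = dx * dy;
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}

// One output pixel: dequantize, blend, requantize.
static uint8_t scale_bilinear_pixel(uint8_t q00, uint8_t q01, uint8_t q10, uint8_t q11,
                                    float dx, float dy,
                                    const UniformQInfo &iq, const UniformQInfo &oq)
{
    return quantize_u8(bilinear_blend(dequantize_u8(q00, iq), dequantize_u8(q01, iq),
                                      dequantize_u8(q10, iq), dequantize_u8(q11, iq), dx, dy),
                       oq);
}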
diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
index cd63dfba63..5a885178a7 100644
--- a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
@@ -28,9 +28,16 @@ namespace arm_compute
{
namespace
{
-void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_signed_neon_scale_bilinear(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
// Data layout is NHWC
const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
@@ -40,10 +47,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const int32_t input_height = src->info()->dimension(2);
// Compute the ratio between source and destination dimensions
- const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
- const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- if(border_mode == BorderMode::CONSTANT)
+ if (border_mode == BorderMode::CONSTANT)
{
const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
@@ -58,7 +67,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
win_in.set(1, Window::Dimension(0, 0, 0));
win_in.set(2, Window::Dimension(0, 0, 0));
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+ for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
{
win_off.set(d, Window::Dimension(0, 0, 0));
}
@@ -67,36 +76,41 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
Iterator out(dst, window);
const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+ const int32_t index_w =
+ *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+ const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+
+ const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+ : const_border_value;
+ const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+ const auto a11 =
+ (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+ ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+ : const_border_value;
+
+ const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
+ const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
+ const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
+ const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
+ *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(
+ scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+ },
+ in, out);
}
- else if(border_mode == BorderMode::REPLICATE)
+ else if (border_mode == BorderMode::REPLICATE)
{
using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
@@ -140,12 +154,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{});
const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset);
- for(int bo = bo_start; bo < bo_end; bo += bo_step)
+ for (int bo = bo_start; bo < bo_end; bo += bo_step)
{
const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
- for(int yo = yo_start; yo < yo_end; yo += yo_step)
+ for (int yo = yo_start; yo < yo_end; yo += yo_step)
{
// Floating-point coordinate
const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -162,7 +176,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
- for(int xo = xo_start; xo < xo_end; xo += xo_step)
+ for (int xo = xo_start; xo < xo_end; xo += xo_step)
{
// Floating-point coordinate
const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -193,7 +207,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
int cout = 0;
- for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+ for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
{
const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -203,34 +217,70 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00));
const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
- const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), vscale_in);
- const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), vscale_in);
- const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), vscale_in);
- const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), vscale_in);
+ const auto in00_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)),
+ vscale_in);
+ const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)),
+ vscale_in);
+ const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)),
+ vscale_in);
+ const auto in00_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)),
+ vscale_in);
const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01));
const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
- const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), vscale_in);
- const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), vscale_in);
- const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), vscale_in);
- const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), vscale_in);
+ const auto in01_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)),
+ vscale_in);
+ const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)),
+ vscale_in);
+ const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)),
+ vscale_in);
+ const auto in01_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)),
+ vscale_in);
const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10));
const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
- const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), vscale_in);
- const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), vscale_in);
- const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), vscale_in);
- const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), vscale_in);
+ const auto in10_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)),
+ vscale_in);
+ const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)),
+ vscale_in);
+ const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)),
+ vscale_in);
+ const auto in10_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)),
+ vscale_in);
const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11));
const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
- const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), vscale_in);
- const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), vscale_in);
- const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), vscale_in);
- const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), vscale_in);
+ const auto in11_0 = wrapper::vmul(
+ wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)),
+ vscale_in);
+ const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)),
+ vscale_in);
+ const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+ wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)),
+ vscale_in);
+ const auto in11_3 =
+ wrapper::vmul(wrapper::vcvt<float>(
+ wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)),
+ vscale_in);
auto out_0 = wrapper::vmul(in00_0, s00);
out_0 = wrapper::vmla(out_0, in01_0, s01);
@@ -263,14 +313,16 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
const auto out_2_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
const auto out_3_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
#endif // defined(__aarch64__) && !defined(BARE_METAL)
- const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
- const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
- const auto out = wrapper::vcombine(low_part, high_part);
+ const auto low_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+ const auto high_part =
+ wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+ const auto out = wrapper::vcombine(low_part, high_part);
wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
}
- for(; cout < out_dim_ch; ++cout)
+ for (; cout < out_dim_ch; ++cout)
{
const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -291,7 +343,8 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
#if defined(__aarch64__) && !defined(BARE_METAL)
*(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info);
#else // defined(__aarch64__) && !defined(BARE_METAL)
- *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
+ *(out_ptr_xo_yo + cout * sizeof(int8_t)) =
+ quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
#endif // defined(__aarch64__) && !defined(BARE_METAL)
}
}
@@ -303,28 +356,39 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const
ARM_COMPUTE_ERROR("Not implemented");
}
}
-}
+} // namespace
namespace cpu
{
-void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_signed_neon_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
- if(policy == InterpolationPolicy::BILINEAR)
+ if (policy == InterpolationPolicy::BILINEAR)
{
- if(src->info()->quantization_info() == dst->info()->quantization_info() && border_mode == BorderMode::REPLICATE)
+ if (src->info()->quantization_info() == dst->info()->quantization_info() &&
+ border_mode == BorderMode::REPLICATE)
{
- s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+ align_corners, window);
}
else
{
- qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+ qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value,
+ sampling_offset, align_corners, window);
}
}
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
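Note: the REPLICATE branch reformatted above widens each 16-lane quantized vector into four float32x4_t groups before blending, then narrows back with saturation. The sketch below shows that widen/dequantize and saturating requantize round trip with raw NEON intrinsics instead of the library's wrapper:: layer (assumed to be a thin veneer over the same instructions); the round-to-nearest convert mirrors the aarch64 branch guarded by the #if above.

// Sketch only: uint8x16_t -> 4 x float32x4_t dequantize, then requantize back (aarch64 NEON).
#include <arm_neon.h>
#include <cstdint>

// Dequantize 16 QASYMM8 values: f = (q - offset) * scale, lane by lane.
static void dequantize_u8x16(const uint8_t *src, float scale, int32_t offset, float32x4_t out[4])
{
    const uint8x16_t  q       = vld1q_u8(src);
    const int32x4_t   voffset = vdupq_n_s32(offset);
    const float32x4_t vscale  = vdupq_n_f32(scale);
    const uint16x8_t  lo      = vmovl_u8(vget_low_u8(q));
    const uint16x8_t  hi      = vmovl_u8(vget_high_u8(q));
    const int32x4_t   w0      = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo)));
    const int32x4_t   w1      = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lo)));
    const int32x4_t   w2      = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi)));
    const int32x4_t   w3      = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(hi)));
    out[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(w0, voffset)), vscale);
    out[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(w1, voffset)), vscale);
    out[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(w2, voffset)), vscale);
    out[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(w3, voffset)), vscale);
}

// Requantize four float32x4_t back to 16 QASYMM8 values via saturating narrowing.
static uint8x16_t quantize_u8x16(const float32x4_t in[4], float scale, int32_t offset)
{
    const float32x4_t vinv    = vdupq_n_f32(1.f / scale);
    const float32x4_t voffset = vdupq_n_f32(static_cast<float>(offset));
    const uint32x4_t  q0      = vcvtnq_u32_f32(vmlaq_f32(voffset, in[0], vinv));
    const uint32x4_t  q1      = vcvtnq_u32_f32(vmlaq_f32(voffset, in[1], vinv));
    const uint32x4_t  q2      = vcvtnq_u32_f32(vmlaq_f32(voffset, in[2], vinv));
    const uint32x4_t  q3      = vcvtnq_u32_f32(vmlaq_f32(voffset, in[3], vinv));
    const uint16x8_t  lo      = vcombine_u16(vqmovn_u32(q0), vqmovn_u32(q1));
    const uint16x8_t  hi      = vcombine_u16(vqmovn_u32(q2), vqmovn_u32(q3));
    return vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi));
}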
diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp
index ceda19f366..cb28f4cb1c 100644
--- a/src/cpu/kernels/scale/sve/fp16.cpp
+++ b/src/cpu/kernels/scale/sve/fp16.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -41,8 +42,12 @@ namespace arm_compute
{
namespace
{
-void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp16_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -61,38 +66,50 @@ void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp16_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -103,4 +120,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co
}
} // namespace cpu
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
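Note: the nearest-neighbour kernels above map every output coordinate back to a source index through the resize ratio, the sampling offset, and the align_corners rounding choice. A scalar sketch of that mapping follows; the ratio formula and the rounding helper are assumed behaviours of scale_utils::calculate_resize_ratio and utils::rounding::round_half_away_from_zero, not copies of them.

// Sketch: how a destination coordinate is mapped to a source coordinate.
#include <cmath>

// Ratio between source and destination extents (assumed behaviour of calculate_resize_ratio).
static float resize_ratio(int in_size, int out_size, bool align_corners)
{
    return (align_corners && out_size > 1) ? float(in_size - 1) / float(out_size - 1)
                                           : float(in_size) / float(out_size);
}

static int round_half_away_from_zero(float x)
{
    return static_cast<int>(x >= 0.f ? std::floor(x + 0.5f) : std::ceil(x - 0.5f));
}

// Source index for nearest-neighbour scaling, mirroring the in_hi computation above.
static int nearest_src_index(int dst_index, float ratio, float sampling_offset, bool align_corners)
{
    const float src_coord = (dst_index + sampling_offset) * ratio;
    return align_corners ? round_half_away_from_zero(src_coord)
                         : static_cast<int>(std::floor(src_coord));
}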
diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp
index f3472f1efd..cbb345edbb 100644
--- a/src/cpu/kernels/scale/sve/fp32.cpp
+++ b/src/cpu/kernels/scale/sve/fp32.cpp
@@ -25,23 +25,27 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
+#include <arm_sve.h>
#include <cmath>
#include <cstddef>
-#include <arm_sve.h>
-
namespace arm_compute
{
namespace
{
-void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void fp32_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@ void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void fp32_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp
index 82c70ee360..df950b1789 100644
--- a/src/cpu/kernels/scale/sve/integer.cpp
+++ b/src/cpu/kernels/scale/sve/integer.cpp
@@ -25,9 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -39,8 +40,12 @@ namespace arm_compute
{
namespace
{
-void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void u8_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -59,32 +64,40 @@ void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offse
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
}
-void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void s16_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -103,38 +116,50 @@ void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void u8_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
@@ -144,12 +169,20 @@ void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, cons
}
}
-void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void s16_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h
index b9c3a10a78..aff741a4a7 100644
--- a/src/cpu/kernels/scale/sve/list.h
+++ b/src/cpu/kernels/scale/sve/list.h
@@ -28,10 +28,10 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
+#define DECLARE_SCALE_KERNEL(func_name) \
+ void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+ InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
+ float sampling_offset, bool align_corners, const Window &window)
DECLARE_SCALE_KERNEL(fp16_sve_scale);
DECLARE_SCALE_KERNEL(fp32_sve_scale);
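Note: the reflowed DECLARE_SCALE_KERNEL macro above stamps out one declaration per data-type variant with a shared parameter list. A reduced illustration of the same declaration-macro pattern is given below, with stand-in types rather than the library's ITensor/Window.

// Sketch of a declaration macro: one shared signature, many typed entry points.
struct Tensor; // stand-ins for the real tensor/window types
struct Window;

#define DECLARE_COPY_KERNEL(func_name) \
    void func_name(const Tensor *src, Tensor *dst, const Window &window)

DECLARE_COPY_KERNEL(fp16_copy); // expands to: void fp16_copy(const Tensor *src, Tensor *dst, const Window &window);
DECLARE_COPY_KERNEL(fp32_copy);
DECLARE_COPY_KERNEL(u8_copy);

#undef DECLARE_COPY_KERNEL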
diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp
index d45a69e43b..0fc794c6c2 100644
--- a/src/cpu/kernels/scale/sve/qasymm8.cpp
+++ b/src/cpu/kernels/scale/sve/qasymm8.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -40,8 +40,12 @@ namespace arm_compute
{
namespace
{
-void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void qasymm8_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@ void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
index 67bca65f58..68ea01e29e 100644
--- a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "support/Rounding.h"
@@ -40,8 +40,12 @@ namespace arm_compute
{
namespace
{
-void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
+void qasymm8_signed_sve_scale_nearest(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@ void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const IT
const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- // Store results
- svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+ const int32_t offset =
+ *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+ const auto in_hi = static_cast<int>(
+ align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+ : std::floor((id.z() + sampling_offset) * hr));
+ const int offset_row = in_hi * in_stride_wc;
+ const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+ const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
- x += svcntw();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b8(x, window_end_x);
+ do
+ {
+ // Store results
+ svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+
+ x += svcntw();
+ pg = svwhilelt_b8(x, window_end_x);
+ } while (svptest_any(svptrue_b8(), pg));
+ },
+ out);
}
+} // namespace
namespace cpu
{
-void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
+void qasymm8_signed_sve_scale(const ITensor *src,
+ ITensor *dst,
+ const ITensor *offsets,
+ const ITensor *dx,
+ const ITensor *dy,
+ InterpolationPolicy policy,
+ BorderMode border_mode,
+ PixelValue constant_border_value,
+ float sampling_offset,
+ bool align_corners,
+ const Window &window)
{
ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
- if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
{
qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
}
diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp
index b460213c72..38a58099bd 100644
--- a/src/cpu/kernels/select/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/select/generic/neon/fp16.cpp
@@ -23,20 +23,22 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window);
}
-void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<float16_t>(c, x, y, output, window);
}
@@ -45,4 +47,4 @@ void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITe
} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp
index 63fd594901..50a80cb338 100644
--- a/src/cpu/kernels/select/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/select/generic/neon/fp32.cpp
@@ -22,20 +22,22 @@
* SOFTWARE.
*/
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_32<float, uint32x4_t>(c, x, y, output, window);
}
-void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<float>(c, x, y, output, window);
}
diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h
index 6a6d9969f8..7ce640b6ff 100644
--- a/src/cpu/kernels/select/generic/neon/impl.h
+++ b/src/cpu/kernels/select/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/NEON/NEAsymm.h"
#include "src/cpu/kernels/select/generic/neon/impl.h"
@@ -37,8 +38,16 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+void select_op(const ITensor *cond,
+ const ITensor *in1,
+ const ITensor *in2,
+ ITensor *out,
+ const Window &window,
+ const int window_step_x,
+ const int window_start_x,
+ const int window_end_x,
+ const int limit,
+ VectorType (*condition_conversion)(const uint8_t *))
{
Window win = window;
win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -48,30 +57,32 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen
Iterator input2(in2, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-
- int x = window_start_x;
- for(; x <= limit; x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const auto c = (*condition_conversion)(condition_ptr + x);
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
- }
- for(; x < window_end_x; ++x)
- {
- const auto c = *(condition_ptr + x);
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = static_cast<bool>(c) ? a : b;
- }
- },
- condition, input1, input2, output);
+ auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+ const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+ int x = window_start_x;
+ for (; x <= limit; x += window_step_x)
+ {
+ const auto c = (*condition_conversion)(condition_ptr + x);
+ const auto a = wrapper::vloadq(input1_ptr + x);
+ const auto b = wrapper::vloadq(input2_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+ }
+ for (; x < window_end_x; ++x)
+ {
+ const auto c = *(condition_ptr + x);
+ const auto a = *(input1_ptr + x);
+ const auto b = *(input2_ptr + x);
+ *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+ }
+ },
+ condition, input1, input2, output);
}
template <typename ScalarType, typename VectorType>
@@ -81,11 +92,14 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
- });
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+ });
}
template <typename ScalarType, typename VectorType>
@@ -95,11 +109,14 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
- });
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+ });
}
template <typename ScalarType, typename VectorType>
@@ -109,15 +126,19 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
- {
- static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
- return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
- });
+ select_op<ScalarType, VectorType>(
+ cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+ [](const uint8_t *condition_ptr) -> VectorType
+ {
+ static const auto zero =
+ wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+ return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+ });
}
template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void select_op_not_same_rank(
+ const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
ARM_COMPUTE_UNUSED(window);
@@ -131,20 +152,20 @@ void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITen
int offset = 0;
const int step = 16 / in1->info()->element_size();
- for(int i = 0; i < outer_size; ++i)
+ for (int i = 0; i < outer_size; ++i)
{
int x = offset;
const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
- for(; x <= offset + inner_size - step; x += step)
+ for (; x <= offset + inner_size - step; x += step)
{
wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
}
- if(x <= offset + inner_size - (step / 2))
+ if (x <= offset + inner_size - (step / 2))
{
wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
x += step / 2;
}
- for(; x < offset + inner_size; ++x)
+ for (; x < offset + inner_size; ++x)
{
*(output_ptr + x) = *(input_ptr + x);
}
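
The select kernels above all share the same iteration scaffolding: the window's X dimension is collapsed to a single step, Iterator objects track the per-tensor pointers, and execute_window_loop invokes the lambda once per row while advancing the iterators. A minimal sketch of that scaffolding using the arm_compute API visible in this patch (copy_rows is an illustrative example, not part of this change):

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"

#include <cstdint>

// Sketch of the window/iterator pattern used by select_op and the other kernels
// in this patch: collapse DimX so the lambda receives whole rows, then loop over
// x by hand inside it (vectorized body plus scalar tail in the real kernels).
void copy_rows(const arm_compute::ITensor *src, arm_compute::ITensor *dst, const arm_compute::Window &window)
{
    using namespace arm_compute;

    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    Window win{window};
    win.set(Window::DimX, Window::Dimension(0, 1, 1)); // one lambda invocation per row

    Iterator input(src, win);
    Iterator output(dst, win);

    execute_window_loop(
        win,
        [&](const Coordinates &)
        {
            const auto in_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
            const auto out_ptr = reinterpret_cast<uint8_t *>(output.ptr());
            for (int x = window_start_x; x < window_end_x; ++x)
            {
                out_ptr[x] = in_ptr[x];
            }
        },
        input, output);
}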
diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp
index 71b2f0b933..135087c261 100644
--- a/src/cpu/kernels/select/generic/neon/integer.cpp
+++ b/src/cpu/kernels/select/generic/neon/integer.cpp
@@ -25,59 +25,71 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include <arm_neon.h>
-
#include "src/cpu/kernels/select/generic/neon/impl.h"
+#include <arm_neon.h>
+
namespace arm_compute
{
namespace cpu
{
-void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s8_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window);
}
-void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window);
}
-void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window);
}
-void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s8_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<int8_t>(c, x, y, output, window);
}
-void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<int16_t>(c, x, y, output, window);
}
-void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<int32_t>(c, x, y, output, window);
}
-void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u8_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window);
}
-void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u16_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window);
}
-void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u32_select_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window);
}
-void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u8_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<uint8_t>(c, x, y, output, window);
}
-void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u16_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<uint16_t>(c, x, y, output, window);
}
-void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u32_select_not_same_rank(
+ const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
{
return select_op_not_same_rank<uint32_t>(c, x, y, output, window);
}
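
Each wrapper above only picks the element width; the actual blend is the mask-and-bitwise-select idiom from select_op. A raw-intrinsics sketch of the 32-bit path (select4_f32 is a hypothetical helper, not part of this patch): widen four condition bytes to 32-bit lanes, compare against zero to form a full-width mask, then blend with vbslq.

#include <arm_neon.h>
#include <cstdint>

// Sketch of the condition conversion + bitwise select used by select_op_32 above:
// lane i of the result is a[i] where c[i] != 0 and b[i] otherwise.
// Note: the 8-byte condition load mirrors the kernel, so the caller must provide
// at least 8 readable condition bytes even though only the first 4 are used.
void select4_f32(const uint8_t *c, const float *a, const float *b, float *out)
{
    const uint8x8_t  c_u8  = vld1_u8(c);                         // 8 condition bytes (only 4 used)
    const uint16x8_t c_u16 = vmovl_u8(c_u8);                     // widen u8 -> u16
    const uint32x4_t c_u32 = vmovl_u16(vget_low_u16(c_u16));     // widen low half u16 -> u32
    const uint32x4_t mask  = vcgtq_u32(c_u32, vdupq_n_u32(0));   // all-ones lanes where c != 0

    vst1q_f32(out, vbslq_f32(mask, vld1q_f32(a), vld1q_f32(b))); // bitwise select
}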
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index f6556696b0..2e2adf33e0 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
@@ -30,8 +31,13 @@ namespace arm_compute
{
namespace cpu
{
-void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp16_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<float16_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index ddd270ae70..61df40c1b5 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp32_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<float>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index f07fd2fb27..5d6e6a4f80 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
#include "support/SaturateCast.h"
namespace arm_compute
@@ -32,11 +33,10 @@ template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *o
template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
{
- static_assert(std::is_same<T, qasymm8_t>::value
- || std::is_same<T, qasymm8_signed_t>::value,
+ static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
"quantized type should be either qasymm8_t or qasymm8_signed_t.");
const int start_x = in->info()->valid_region().anchor.x();
@@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
Iterator out_it(out, window);
constexpr int vec_size = 16;
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
- float sum_inversed{};
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- /* Init sum to zero */
- float32x4x4_t vec_sum =
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- };
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+ float sum{};
+ float sum_inversed{};
- if(is_log)
- {
- vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
- vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
- vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
- vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
- }
- else
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+
+ /* Init sum to zero */
+ float32x4x4_t vec_sum = {
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ vdupq_n_f32(0.f),
+ };
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
- vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
- vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
- vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vqsub(vec_max, vec_elements);
+ auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+
+ if (is_log)
+ {
+ vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+ vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+ vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+ vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+ }
+ else
+ {
+ vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+ vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+ vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+ vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+ vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+ vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+ vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+ vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
+ }
+
+ vst4q_f32(tmp_ptr + x, vec_elements_flt);
}
- vst4q_f32(tmp_ptr + x, vec_elements_flt);
- }
+ /* Reduce sum */
+ const auto sum_16_byte =
+ vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+ auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
+ sum_res = vpadd_f32(sum_res, sum_res);
+ sum = wrapper::vgetlane(sum_res, 0);
- /* Reduce sum */
- const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
- auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
- sum_res = vpadd_f32(sum_res, sum_res);
- sum = wrapper::vgetlane(sum_res, 0);
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ float element{};
+ if (is_log)
+ {
+ element = (max_val - in_ptr[x]) * scale_beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((max_val - in_ptr[x]) * scale_beta);
+ sum += element;
+ }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- float element{};
- if(is_log)
+ tmp_ptr[x] = element;
+ }
+
+ if (!is_log)
{
- element = (max_val - in_ptr[x]) * scale_beta;
- sum += std::exp(element);
+ sum_inversed = 256.f / sum;
}
else
{
- element = std::exp((max_val - in_ptr[x]) * scale_beta);
- sum += element;
+ sum = std::log(sum);
}
-
- tmp_ptr[x] = element;
}
- if(!is_log)
- {
- sum_inversed = 256.f / sum;
- }
- else
+ /* Normalize exponentials */
{
- sum = std::log(sum);
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
- float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
- int_vec_type normalized_value{};
- if(is_log)
+ constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- const float32x4x4_t sub =
+ using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
+ float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
+ int_vec_type normalized_value{};
+ if (is_log)
{
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
- };
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ const float32x4x4_t sub = {
+ vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
+ vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+ };
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+ }
+ else
+ {
+ float32x4x4_t mul = {
+ vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
+ vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+ };
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
+ mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
+ mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
+ mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
+ mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ }
+
+ normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
}
- else
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
{
- float32x4x4_t mul =
+ if (is_log)
{
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
- };
-
- if(is_qasymm8_signed)
+ out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+ }
+ else
{
- const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
- mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
- mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
- mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
- mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
+ out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+ (is_qasymm8_signed ? 128.f : 0));
}
-
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
- }
- else
- {
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
}
}
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
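
As a reading aid for the reformatted quantized path above, here is a plain scalar sketch of the same two-pass computation for the unsigned qasymm8 case. softmax_qasymm8_reference is illustrative only and not part of this patch; scale_beta is the pre-computed factor the kernel applies to the (max - x) differences (defined earlier in the file, taken here as an opaque parameter), and the kernel's saturating cast is approximated with std::lround plus a clamp.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar sketch of neon_softmax_logits_1d_quantized (non-log, qasymm8), assuming a
// non-empty row: pass 1 builds exp((max - x) * scale_beta) per element plus their
// sum, pass 2 rescales by 256/sum and saturates back to 8 bits.
std::vector<uint8_t> softmax_qasymm8_reference(const std::vector<uint8_t> &in, float scale_beta)
{
    const uint8_t max_val = *std::max_element(in.begin(), in.end());

    std::vector<float> tmp(in.size());
    float              sum = 0.f;
    for (size_t i = 0; i < in.size(); ++i)
    {
        tmp[i] = std::exp(static_cast<float>(max_val - in[i]) * scale_beta);
        sum += tmp[i];
    }

    std::vector<uint8_t> out(in.size());
    const float          sum_inversed = 256.f / sum;
    for (size_t i = 0; i < in.size(); ++i)
    {
        const long q = std::lround(tmp[i] * sum_inversed);
        out[i]       = static_cast<uint8_t>(std::min<long>(std::max<long>(q, 0), 255));
    }
    return out;
}

The qasymm8_signed branch above additionally subtracts 128 before the final cast, and the log branch stores the scaled differences and subtracts log(sum) instead of multiplying by 256/sum.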
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 206d36a2e0..4d9b789297 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -25,6 +25,7 @@
#define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
#include "arm_compute/core/Helpers.h"
+
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(in, win);
Iterator output(out, win);
const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<T *>(output.ptr());
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
+ // Init max value
+ auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+ int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto current_value = wrapper::vloadq(in_ptr + x);
+ vec_max = wrapper::vmax(vec_max, current_value);
+ }
+ auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
+ for (int i = 0; i < sum_stages; ++i)
+ {
+ carry_max = wrapper::vpmax(carry_max, carry_max);
+ }
+ T max_val = wrapper::vgetlane(carry_max, 0);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
+ }
- *out_ptr = max_val;
- },
- input, output);
+ *out_ptr = max_val;
+ },
+ input, output);
}
template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+void neon_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c
constexpr int vec_size = 16 / sizeof(T);
const int sum_stages = log2(vec_size / 2);
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<T *>(tmp);
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
+ T sum{};
+ T sum_inversed{};
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
+ /* Compute exponentials and sum */
{
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
+ /* Init sum to zero */
+ auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
+ auto vec_elements = wrapper::vloadq(in_ptr + x);
+ vec_elements = wrapper::vsub(vec_elements, vec_max);
+ if (is_log)
+ {
+ vec_elements =
+ wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
+ vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+ }
+ else
+ {
+ vec_elements = wrapper::vexpq(
+ wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
+ vec_sum = wrapper::vadd(vec_sum, vec_elements);
+ }
+ wrapper::vstore(tmp_ptr + x, vec_elements);
}
- else
+
+ /* Reduce sum */
+ auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
+ for (int i = 0; i < sum_stages; ++i)
{
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
+ sum_res = wrapper::vpadd(sum_res, sum_res);
}
- tmp_ptr[x] = element;
- }
+ sum = wrapper::vgetlane(sum_res, 0);
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
+ {
+ T element{};
+
+ if (is_log)
+ {
+ element = (in_ptr[x] - max_val) * beta;
+ sum += std::exp(element);
+ }
+ else
+ {
+ element = std::exp((in_ptr[x] - max_val) * beta);
+ sum += element;
+ }
+ tmp_ptr[x] = element;
+ }
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
+ if (!is_log)
{
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ sum_inversed = T(1) / sum;
}
else
{
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ sum = static_cast<T>(std::log(sum));
}
- wrapper::vstore(out_ptr + x, normalized_value);
}
- /* Run remaining elements */
- for(; x < input_width; ++x)
+
+ /* Normalize exponentials */
{
- if(is_log)
+ /* Loop over row and compute softmax */
+ int x = 0;
+ for (; x <= (input_width - vec_size); x += vec_size)
{
- out_ptr[x] = tmp_ptr[x] - sum;
+ auto vec_in = wrapper::vloadq(tmp_ptr + x);
+ auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+ if (is_log)
+ {
+ normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+ }
+ else
+ {
+ normalized_value =
+ wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+ }
+ wrapper::vstore(out_ptr + x, normalized_value);
}
- else
+ /* Run remaining elements */
+ for (; x < input_width; ++x)
{
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ if (is_log)
+ {
+ out_ptr[x] = tmp_ptr[x] - sum;
+ }
+ else
+ {
+ out_ptr[x] = tmp_ptr[x] * sum_inversed;
+ }
}
}
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
} // namespace cpu
} // namespace arm_compute
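
The float template above is the numerically-stable two-pass softmax: subtract the row max before exponentiating, accumulate the sum, then either multiply by 1/sum or subtract log(sum). A scalar sketch of exactly that control flow (softmax_reference is illustrative only, not part of this patch; it assumes a non-empty row):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar sketch of neon_softmax_logits_1d_float above. For is_log the scaled
// differences are kept and log(sum) is subtracted at the end; otherwise the
// exponentials are kept and scaled by 1/sum.
std::vector<float> softmax_reference(const std::vector<float> &in, float beta, bool is_log)
{
    const float max_val = *std::max_element(in.begin(), in.end());

    std::vector<float> tmp(in.size());
    float              sum = 0.f;
    for (size_t i = 0; i < in.size(); ++i)
    {
        const float element = (in[i] - max_val) * beta;
        tmp[i]              = is_log ? element : std::exp(element);
        sum += std::exp(element);
    }

    std::vector<float> out(in.size());
    const float        norm = is_log ? std::log(sum) : 1.f / sum;
    for (size_t i = 0; i < in.size(); ++i)
    {
        out[i] = is_log ? tmp[i] - norm : tmp[i] * norm;
    }
    return out;
}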
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index a572891561..40713dc496 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
{
return neon_logits_1d_max<qasymm8_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 7d3fe6e046..2c5e284f54 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -22,14 +22,20 @@
* SOFTWARE.
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_signed_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w
{
return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
index 15a523bfc9..5e94f72faf 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
@@ -23,14 +23,20 @@
*/
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/CpuTypes.h"
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp16_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
}
@@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<float16_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
index 55c4aee426..d692cc2477 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
@@ -23,14 +23,20 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp32_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
}
@@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<float>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 2340a31cbd..24f1bb8143 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -23,6 +23,7 @@
*/
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
@@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input(in, win);
Iterator output(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- // Init max value
- auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
+ // Init max value
+ auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto current_value = svld1(pg, in_ptr + x);
- vec_max = svmax_m(pg, vec_max, current_value);
+ int x = window_start_x;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ do
+ {
+ const auto current_value = svld1(pg, in_ptr + x);
+ vec_max = svmax_m(pg, vec_max, current_value);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+ } while (svptest_any(all_true_pg, pg));
- auto max_val = svmaxv(all_true_pg, vec_max);
+ auto max_val = svmaxv(all_true_pg, vec_max);
- *out_ptr = max_val;
- },
- input, output);
+ *out_ptr = max_val;
+ },
+ input, output);
}
template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co
const auto all_true_pg = wrapper::svptrue<ScalarType>();
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
- ScalarType sum{ 0 };
-
- /* Compute exponentials and sum */
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
- const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
- /* Init sum to zero */
- auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+ ScalarType sum{0};
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
+ /* Compute exponentials and sum */
{
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
- if(!is_log)
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+ const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+
+ /* Init sum to zero */
+ auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
{
- vec_elements = wrapper::svexp_z(pg, vec_elements);
- vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ auto vec_elements = svld1(pg, in_ptr + x);
+ vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
+ if (!is_log)
+ {
+ vec_elements = wrapper::svexp_z(pg, vec_elements);
+ vec_sum = svadd_m(pg, vec_sum, vec_elements);
+ }
+ svst1(pg, tmp_ptr + x, vec_elements);
+
+ if (is_log)
+ {
+ vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ }
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ sum = svaddv(all_true_pg, vec_sum);
+
+ if (is_log)
+ {
+ sum = static_cast<ScalarType>(std::log(sum));
}
- svst1(pg, tmp_ptr + x, vec_elements);
-
- if(is_log)
+ else
{
- vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+ sum = ScalarType(1) / sum;
}
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
}
- while(svptest_any(all_true_pg, pg));
- /* Reduce sum */
- sum = svaddv(all_true_pg, vec_sum);
-
- if(is_log)
- {
- sum = static_cast<ScalarType>(std::log(sum));
- }
- else
- {
- sum = ScalarType(1) / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
+ /* Normalize exponentials */
{
- auto vec_in = svld1(pg, tmp_ptr + x);
- auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
- if(is_log)
- {
- normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- else
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ do
{
- normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- svst1(pg, out_ptr + x, normalized_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ auto vec_in = svld1(pg, tmp_ptr + x);
+ auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
+ if (is_log)
+ {
+ normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ else
+ {
+ normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+ }
+ svst1(pg, out_ptr + x, normalized_value);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ } while (svptest_any(all_true_pg, pg));
}
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
@@ -171,9 +178,19 @@ template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, cons
template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+template void sve_softmax_logits_1d_float<float>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
+template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
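
One difference worth noting between the SVE path above and the NEON templates earlier in this patch: the cross-lane reductions are single svmaxv/svaddv calls over an all-true predicate rather than a pairwise vpmax/vpadd tree. A minimal sketch of that reduction style (max_and_sum_f32_sve is a hypothetical helper, not part of this patch):

#include <arm_sve.h>
#include <limits>

// Sketch of the predicated accumulate + horizontal reduction used by
// sve_logits_1d_max and sve_softmax_logits_1d_float above. Starting the max
// accumulator from the lowest float keeps inactive lanes harmless.
float max_and_sum_f32_sve(const float *data, int n, float *sum_out)
{
    const svbool_t all = svptrue_b32();

    svfloat32_t vec_max = svdup_n_f32(std::numeric_limits<float>::lowest());
    svfloat32_t vec_sum = svdup_n_f32(0.f);

    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, n);
    do
    {
        const svfloat32_t v = svld1_f32(pg, data + x);
        vec_max = svmax_f32_m(pg, vec_max, v); // merging form: inactive lanes keep their value
        vec_sum = svadd_f32_m(pg, vec_sum, v);
        x += static_cast<int>(svcntw());
        pg = svwhilelt_b32(x, n);
    } while (svptest_any(all, pg));

    *sum_out = svaddv_f32(all, vec_sum); // horizontal add across the whole vector
    return svmaxv_f32(all, vec_max);     // horizontal max across the whole vector
}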
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h
index 4f76ec6a26..89a30d042f 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve/impl.h
@@ -33,8 +33,13 @@ template <typename ScalarType>
void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
+void sve_softmax_logits_1d_float(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
index e9044d5fc9..85e5ccfea1 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
@@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window)
{
return sve_logits_1d_max<qasymm8_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
index ab45ce598d..4be2e2eed6 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve/impl.h"
namespace arm_compute
@@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi
{
return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 8f677c62d4..98b2f5117f 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -23,7 +23,9 @@
*/
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -31,8 +33,8 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
+void sve2_softmax_logits_1d_quantized(
+ const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
{
const int start_x = in->info()->valid_region().anchor.x();
const int input_width = in->info()->valid_region().shape.x();
@@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi
const int inc_2 = static_cast<int>(2 * svcntw());
const int inc_3 = static_cast<int>(3 * svcntw());
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- float sum{};
+ float sum{};
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum_0 = svdup_n_f32(0.f);
- auto vec_sum_1 = svdup_n_f32(0.f);
- auto vec_sum_2 = svdup_n_f32(0.f);
- auto vec_sum_3 = svdup_n_f32(0.f);
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
+ /* Compute exponentials and sum */
{
- const auto vec_elements = svld1(pg, in_ptr + x);
- const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+ const auto vec_max = wrapper::svdup_n(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum_0 = svdup_n_f32(0.f);
+ auto vec_sum_1 = svdup_n_f32(0.f);
+ auto vec_sum_2 = svdup_n_f32(0.f);
+ auto vec_sum_3 = svdup_n_f32(0.f);
+
+ /* Loop over row and compute exponentials and sum */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
+ {
+ const auto vec_elements = svld1(pg, in_ptr + x);
+ const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+
+ auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
+ auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
+ auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
- auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
- auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
- auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
- auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
+ if (is_log)
+ {
+ vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
+ vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
+ vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
+ vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ }
+ else
+ {
+ vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
+ vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
+ vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
+ vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
+ vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
+ vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
+ vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
+ vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ }
- if(is_log)
+ svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
+ svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
+ svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
+ svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
+
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
+
+ /* Reduce sum */
+ const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1),
+ svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
+ sum = svaddv_f32(all_true_pg, vec_sum);
+
+ /* Run remaining elements */
+ x = 0;
+ if (is_log)
{
- vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
- vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
- vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
- vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
+ sum = std::log(sum);
}
else
{
- vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
- vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
- vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
- vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+ sum = 256.f / sum;
}
-
- svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
- svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
- svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
- svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
}
- while(svptest_any(all_true_pg, pg));
- /* Reduce sum */
- const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
- sum = svaddv_f32(all_true_pg, vec_sum);
-
- /* Run remaining elements */
- x = 0;
- if(is_log)
- {
- sum = std::log(sum);
- }
- else
+ /* Normalize exponentials */
{
- sum = 256.f / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
- auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
- auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
- auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
-
- svfloat32_t res_0{};
- svfloat32_t res_1{};
- svfloat32_t res_2{};
- svfloat32_t res_3{};
-
- if(is_log)
+ constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
+ /* Loop over row and compute softmax */
+ int x = 0;
+ svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ svbool_t pg_0 = svunpklo(svunpklo(pg));
+ svbool_t pg_1 = svunpkhi(svunpklo(pg));
+ svbool_t pg_2 = svunpklo(svunpkhi(pg));
+ svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+ do
{
- res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
- }
- else
- {
- res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
+ auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
+ auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
+ auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
+
+ svfloat32_t res_0{};
+ svfloat32_t res_1{};
+ svfloat32_t res_2{};
+ svfloat32_t res_3{};
- if(is_qasymm8_signed)
+ if (is_log)
{
- const auto offset_vec = svdup_n_f32(128.f);
- res_0 = svsub_z(pg_0, res_0, offset_vec);
- res_1 = svsub_z(pg_1, res_1, offset_vec);
- res_2 = svsub_z(pg_2, res_2, offset_vec);
- res_3 = svsub_z(pg_3, res_3, offset_vec);
+ res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+ }
+ else
+ {
+ res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+ res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+ res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+ res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+
+ if (is_qasymm8_signed)
+ {
+ const auto offset_vec = svdup_n_f32(128.f);
+ res_0 = svsub_z(pg_0, res_0, offset_vec);
+ res_1 = svsub_z(pg_1, res_1, offset_vec);
+ res_2 = svsub_z(pg_2, res_2, offset_vec);
+ res_3 = svsub_z(pg_3, res_3, offset_vec);
+ }
}
- }
- // Store value
- const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
- svst1(pg, out_ptr + x, out);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
+ // Store value
+ const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
+ svst1(pg, out_ptr + x, out);
+ x += wrapper::svcnt<ScalarType>();
+ pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+ pg_0 = svunpklo(svunpklo(pg));
+ pg_1 = svunpkhi(svunpklo(pg));
+ pg_2 = svunpklo(svunpkhi(pg));
+ pg_3 = svunpkhi(svunpkhi(pg));
+ } while (svptest_any(all_true_pg, pg));
}
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
+ },
+ in_it, max_it, out_it);
}
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
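
Note: the SVE2 kernel above widens each quantized row into four f32 lane groups, accumulates the exponentials, then normalizes in a second pass over the stored tmp values. A scalar reference of the same computation may help when reading the predicated code; this is a sketch only, and the folding of the (max - x) sign into scale_beta in the kernel is assumed rather than shown here.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Scalar sketch of one row of the quantized (log-)softmax computed by the
    // SVE2 kernel above. 'scale' and 'beta' mirror the kernel parameters.
    std::vector<float> softmax_row_reference(const std::vector<uint8_t> &in, float scale, float beta, bool is_log)
    {
        if (in.empty())
        {
            return {};
        }
        const float max_val = static_cast<float>(*std::max_element(in.begin(), in.end()));
        std::vector<float> tmp(in.size());
        float sum = 0.f;
        // First pass: shift by the row max, scale, exponentiate, accumulate the sum.
        for (size_t i = 0; i < in.size(); ++i)
        {
            const float shifted = (static_cast<float>(in[i]) - max_val) * scale * beta;
            tmp[i] = is_log ? shifted : std::exp(shifted);
            sum += std::exp(shifted);
        }
        // Second pass: normalize. The kernel rescales by 256/sum to map back to the
        // 8-bit output range (and subtracts 128 for the signed variant).
        for (auto &v : tmp)
        {
            v = is_log ? v - std::log(sum) : v * (256.f / sum);
        }
        return tmp;
    }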
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h
index abbcc15181..33fcc26cda 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.h
@@ -31,8 +31,13 @@ namespace arm_compute
namespace cpu
{
template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window);
+void sve2_softmax_logits_1d_quantized(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ float beta,
+ bool is_log,
+ const Window &window);
} // namespace cpu
} // namespace arm_compute
#endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
index 810035eb9c..95623786b3 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
@@ -23,16 +23,22 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
index 283b55e9ce..c20462fcef 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
@@ -23,16 +23,22 @@
*/
#include "arm_compute/core/Helpers.h"
+
#include "src/cpu/kernels/softmax/generic/sve2/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_signed_softmax(const ITensor *in,
+ const ITensor *max,
+ void *const tmp,
+ ITensor *out,
+ const float beta,
+ bool is_log,
+ const Window &window)
{
return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
}
-}
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index ed3515f417..627ce0c264 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -28,9 +28,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SOFTMAX_KERNEL(func_name) \
- void func_name(const ITensor *in, const ITensor *max, void *const tmp, \
- ITensor *out, const float beta, bool is_log, const Window &window)
+#define DECLARE_SOFTMAX_KERNEL(func_name) \
+ void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \
+ bool is_log, const Window &window)
DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
@@ -43,8 +43,7 @@ DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax);
#undef DECLARE_SOFTMAX_KERNEL
-#define DECLARE_LOGITS_KERNEL(func_name) \
- void func_name(const ITensor *in, ITensor *out, const Window &window)
+#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window)
DECLARE_LOGITS_KERNEL(neon_fp32_logits);
DECLARE_LOGITS_KERNEL(neon_fp16_logits);
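
For reference, the reflowed DECLARE_SOFTMAX_KERNEL and DECLARE_LOGITS_KERNEL macros above expand each listed entry into an ordinary function declaration; for example:

    // Expansion of DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax):
    void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta,
                           bool is_log, const Window &window);

    // Expansion of DECLARE_LOGITS_KERNEL(neon_fp32_logits):
    void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window);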
diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h
index f7e1a040bd..9f6c92271f 100644
--- a/src/cpu/kernels/sub/neon/list.h
+++ b/src/cpu/kernels/sub/neon/list.h
@@ -26,14 +26,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
-#define DECLARE_SUB_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+#define DECLARE_SUB_KERNEL(func_name) \
+ void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+ const Window &window)
DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint);
DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint);
@@ -44,7 +46,8 @@ DECLARE_SUB_KERNEL(sub_qsymm16_neon);
#undef DECLARE_SUB_KERNEL
template <typename T>
-void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_same_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -68,7 +71,7 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
Iterator output(dst, window);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -84,41 +87,44 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
- const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+ const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
+ const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
- if(is_broadcast_input_2)
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+ const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+ auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v)
+ : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
+ if (is_broadcast_input_2)
+ {
+ res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+ }
+ wrapper::vstore(output_ptr + x, res);
}
- wrapper::vstore(output_ptr + x, res);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
- if(is_broadcast_input_2)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- res = static_cast<T>(-1) * res;
+ const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+ auto res =
+ is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
+ if (is_broadcast_input_2)
+ {
+ res = static_cast<T>(-1) * res;
+ }
+
+ *(output_ptr + x) = res;
}
-
- *(output_ptr + x) = res;
- }
- },
- broadcast_input, non_broadcast_input, output);
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -131,31 +137,32 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const
Iterator output(dst, win);
execute_window_loop(
- win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ win,
+ [&](const Coordinates &)
{
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
- }
- },
- input1, input2, output);
+ const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto val1 = wrapper::vloadq(input1_ptr + x);
+ const auto val2 = wrapper::vloadq(input2_ptr + x);
+ const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
+ wrapper::vstore(output_ptr + x, res);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto val1 = *(input1_ptr + x);
+ const auto val2 = *(input2_ptr + x);
+ *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
+ }
+ },
+ input1, input2, output);
}
}
} // namespace cpu
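
The reformatted sub_same_neon body above keeps the library's usual split between a vectorised main loop ("Compute S elements per iteration") and a scalar tail ("Compute left-over elements"). A minimal stand-alone sketch of that idiom in plain C++, with a fixed step of 4 standing in for the NEON vector width and hypothetical raw buffers:

    // Sketch of the main-loop / left-over pattern used by sub_same_neon.
    void sub_pattern_sketch(const float *src0, const float *src1, float *dst, int window_start_x, int window_end_x)
    {
        constexpr int window_step_x = 4; // stands in for the SIMD width
        int x = window_start_x;
        // Main loop: only whole vectors of window_step_x elements.
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            for (int lane = 0; lane < window_step_x; ++lane) // stands in for vloadq/vsub/vstore
            {
                dst[x + lane] = src0[x + lane] - src1[x + lane];
            }
        }
        // Tail: remaining elements handled one by one.
        for (; x < window_end_x; ++x)
        {
            dst[x] = src0[x] - src1[x];
        }
    }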
diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp
index ea6e5826dd..b750afce6e 100644
--- a/src/cpu/kernels/sub/neon/qasymm8.cpp
+++ b/src/cpu/kernels/sub/neon/qasymm8.cpp
@@ -23,21 +23,24 @@
*/
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sub_qasymm8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_q8_neon_fixedpoint<uint8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
}
-void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
index a86c7f22f6..fb0bb62682 100644
--- a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
@@ -24,21 +24,24 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/cpu/kernels/add/generic/neon/impl.h"
namespace arm_compute
{
namespace cpu
{
-void sub_qasymm8_signed_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_signed_neon_fixedpoint(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_q8_neon_fixedpoint<int8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
}
-void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_signed_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp
index 4dfdc0e78c..23e4b03843 100644
--- a/src/cpu/kernels/sub/neon/qsymm16.cpp
+++ b/src/cpu/kernels/sub/neon/qsymm16.cpp
@@ -25,14 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
namespace arm_compute
{
namespace cpu
{
-void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qsymm16_neon(
+ const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
ARM_COMPUTE_UNUSED(policy);
@@ -57,7 +59,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
@@ -65,7 +67,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -74,61 +76,62 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+ const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+ const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
- const float32x4x2_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
- }
- };
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+ const float32x4x2_t bf = {{
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
+ }};
+ const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const float32x4x2_t af =
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
+ const float32x4x2_t af = {{
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
+ }};
- const int32x4x4_t rf =
- {
- {
+ const int32x4x4_t rf = {{
#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+ : vsubq_f32(af.val[0], bf.val[0]),
+ invvscaleo)),
+ vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+ : vsubq_f32(af.val[1], bf.val[1]),
+ invvscaleo)),
#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+ : vsubq_f32(af.val[0], bf.val[0]),
+ invvscaleo)),
+ vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+ : vsubq_f32(af.val[1], bf.val[1]),
+ invvscaleo)),
#endif //__aarch64__
- }
- };
+ }};
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+ vst1q_s16(output_ptr + x, pa);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
+ }
+ },
+ broadcast_input, non_broadcast_input, output);
}
else
{
@@ -140,38 +143,32 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
Iterator input2(src1, input2_win);
Iterator output(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
{
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
+ const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+ const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
- const float32x4x2_t af =
+ // Compute S elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- {
+ const int16x8_t a = vld1q_s16(input1_ptr + x);
+ const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+ const float32x4x2_t af = {{
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
+ }};
- const float32x4x2_t bf =
- {
- {
+ const float32x4x2_t bf = {{
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
- }
- };
+ }};
- const int32x4x2_t rf =
- {
- {
+ const int32x4x2_t rf = {{
#ifdef __aarch64__
vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
@@ -179,23 +176,22 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co
vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
- }
- };
+ }};
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
+ const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+ vst1q_s16(output_ptr + x, pa);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+ const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+ *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
+ }
+ },
+ input1, input2, output);
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
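
The left-over path in sub_qsymm16_neon above spells out the quantised arithmetic directly: dequantise both inputs, subtract, requantise with the output scale. A scalar sketch of that math follows; the round-and-saturate helper approximates quantize_qsymm16 and is an assumption here, not the library's own implementation.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar sketch of the QSYMM16 subtraction performed by sub_qsymm16_neon.
    int16_t sub_qsymm16_scalar(int16_t a, int16_t b, float scale1, float scale2, float out_scale)
    {
        const float afs = static_cast<float>(a) * scale1;                  // dequantise input 1
        const float bfs = static_cast<float>(b) * scale2;                  // dequantise input 2
        const float res = (afs - bfs) / out_scale;                         // invvscaleo in the kernel
        const int32_t q = static_cast<int32_t>(std::lround(res));          // vcvtnq-style round to nearest
        return static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, q))); // vqmovn-style saturate
    }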
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
index 197e9850b9..44d70cf503 100644
--- a/src/cpu/operators/CpuActivation.cpp
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/operators/CpuActivation.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/IOperator.h"
#include "src/common/utils/LegacySupport.h"
#include "src/common/utils/Log.h"
@@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con
_kernel = std::move(k);
}
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+Status
+CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
{
return kernels::CpuActivationKernel::validate(input, output, activation_info);
}
@@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
-std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src,
+ const AclTensorDescriptor &dst,
+ const AclActivationDescriptor &act,
+ bool is_validate)
{
TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
auto info = detail::convert_to_activation_info(act);
- if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ if (is_validate &&
+ !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
{
return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
}
@@ -69,7 +75,7 @@ std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTenso
act_op->configure(&src_info, &dst_info, info);
auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
- if(op == nullptr)
+ if (op == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
return std::make_tuple(nullptr, StatusCode::OutOfMemory);
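
create_activation above follows the validate-then-configure pattern used throughout the cpu backend: check the descriptors with the static validate(), then configure the operator. A hypothetical configuration sketch under the same assumptions (shapes, data type and include paths are illustrative, not taken from this change):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/function_info/ActivationLayerInfo.h"
    #include "src/cpu/operators/CpuActivation.h"

    // Sketch: configure a RELU activation if validation succeeds.
    void configure_relu_example()
    {
        using namespace arm_compute;
        TensorInfo src_info(TensorShape(32U, 32U), 1, DataType::F32); // illustrative shape/type
        TensorInfo dst_info(TensorShape(32U, 32U), 1, DataType::F32);
        const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

        cpu::CpuActivation op;
        if (bool(cpu::CpuActivation::validate(&src_info, &dst_info, act)))
        {
            op.configure(&src_info, &dst_info, act);
            // Execution then goes through run(ITensorPack &) with ACL_SRC/ACL_DST tensors bound.
        }
    }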
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
index e21fc7d32c..ec442f92c8 100644
--- a/src/cpu/operators/CpuActivation.h
+++ b/src/cpu/operators/CpuActivation.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ACTIVATION_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
index 41def8e22f..53cd7fa1b7 100644
--- a/src/cpu/operators/CpuAdd.cpp
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -23,17 +23,20 @@
*/
#include "src/cpu/operators/CpuAdd.h"
-#include "src/cpu/kernels/CpuAddKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
namespace arm_compute
{
namespace cpu
{
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAdd::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info);
@@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor
_kernel = std::move(k);
}
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAdd::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
index db05c100cc..5f60102de2 100644
--- a/src/cpu/operators/CpuAdd.h
+++ b/src/cpu/operators/CpuAdd.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ADD_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -55,14 +56,22 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuAdd::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp
index 590ee482ca..2f19f2f842 100644
--- a/src/cpu/operators/CpuAddMulAdd.cpp
+++ b/src/cpu/operators/CpuAddMulAdd.cpp
@@ -21,39 +21,49 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuAddMulAddKernel.h"
-#include "src/cpu/operators/CpuAddMulAdd.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
namespace arm_compute
{
namespace cpu
{
-void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAdd::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
auto k = std::make_unique<kernels::CpuAddMulAddKernel>();
const DataType data_type = input1->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
_dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul);
_dequantize_bn_add.configure(bn_add, &_dequantized_bn_add);
- k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info);
+ k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy,
+ act_info);
// Save auxilary memory requirements after configuration
- _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size());
- _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size());
+ _aux_mem[DequantizedBnMul] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_mul.total_size());
+ _aux_mem[DequantizedBnAdd] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_add.total_size());
}
else
{
@@ -63,13 +73,17 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input
_kernel = std::move(k);
}
-Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAdd::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
const DataType data_type = input1->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32);
TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32);
@@ -77,11 +91,13 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *inpu
ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul));
ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add));
- return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info);
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add,
+ add_output, final_output, policy, act_info);
}
else
{
- return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy,
+ act_info);
}
}
@@ -89,37 +105,32 @@ void CpuAddMulAdd::run(ITensorPack &tensors)
{
const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2);
const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3);
- CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true);
- CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true);
+ CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors,
+ true);
+ CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors,
+ true);
- ITensorPack dequantize_mul_pack =
- {
- { TensorType::ACL_SRC_0, bn_mul },
- { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() }
- };
+ ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul},
+ {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}};
- ITensorPack dequantize_add_pack =
- {
- { TensorType::ACL_SRC_0, bn_add },
- { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() }
- };
+ ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add},
+ {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}};
_dequantize_bn_mul.run(dequantize_mul_pack);
_dequantize_bn_add.run(dequantize_add_pack);
- ITensorPack add_mul_add_pack =
- {
- { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) },
- { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
- { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() },
- { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() },
- { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) },
- { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) },
+ ITensorPack add_mul_add_pack = {
+ {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)},
+ {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)},
+ {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()},
+ {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()},
+ {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)},
+ {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)},
};
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack);
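
The brace-initialised ITensorPack objects above build the pack from {slot, tensor} pairs in one expression. A sketch of the equivalent incremental form, using the add_tensor/add_const_tensor calls seen elsewhere in this change (the tensor pointers and the include below are assumptions for illustration):

    #include "arm_compute/core/ITensorPack.h"

    // Sketch: brace-init pack versus adding entries one by one.
    void pack_example(const arm_compute::ITensor *src0, arm_compute::ITensor *dst)
    {
        using namespace arm_compute;
        ITensorPack pack = {{TensorType::ACL_SRC_0, src0}, {TensorType::ACL_DST_0, dst}};

        ITensorPack pack2;
        pack2.add_const_tensor(TensorType::ACL_SRC_0, src0);
        pack2.add_tensor(TensorType::ACL_DST_0, dst);
        (void)pack;
        (void)pack2;
    }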
diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h
index cf1ece68f1..47db75c37e 100644
--- a/src/cpu/operators/CpuAddMulAdd.h
+++ b/src/cpu/operators/CpuAddMulAdd.h
@@ -42,20 +42,28 @@ public:
* Similar to @ref NEAddMulAdd::configure()
*
*/
- void configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuAddMulAdd::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -77,7 +85,7 @@ private:
TensorInfo _dequantized_bn_mul{};
TensorInfo _dequantized_bn_add{};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
index 1cfd8c1d0e..55b9204d71 100644
--- a/src/cpu/operators/CpuCast.cpp
+++ b/src/cpu/operators/CpuCast.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuCast.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
index 4021fd8ded..5f517a8fcb 100644
--- a/src/cpu/operators/CpuConcatenate.cpp
+++ b/src/cpu/operators/CpuConcatenate.cpp
@@ -23,21 +23,20 @@
*/
#include "src/cpu/operators/CpuConcatenate.h"
-#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
namespace arm_compute
{
@@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
unsigned int offset = 0;
- for(unsigned int i = 0; i < _num_srcs; ++i)
+ for (unsigned int i = 0; i < _num_srcs; ++i)
{
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
@@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
}
}
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
+Status
+CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
unsigned int offset = 0;
- for(const auto &src : srcs_vector)
+ for (const auto &src : srcs_vector)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
@@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec
offset += src->dimension(axis);
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
@@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec
void CpuConcatenate::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
- if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
+ if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
{
ARM_COMPUTE_ERROR("Configured with different number of inputs");
}
int i = 0;
- for(auto &k : _concat_kernels)
+ for (auto &k : _concat_kernels)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
index eb11926b48..c36977c70f 100644
--- a/src/cpu/operators/CpuConcatenate.h
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -68,8 +68,8 @@ public:
private:
std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
- unsigned int _num_srcs{ 0 };
- unsigned int _axis{ 0 };
+ unsigned int _num_srcs{0};
+ unsigned int _axis{0};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index 16ac16b3ba..19311733db 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuConv2d.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDirectConv2d.h"
#include "src/cpu/operators/CpuGemm.h"
@@ -35,26 +37,35 @@ namespace arm_compute
{
namespace cpu
{
-CpuConv2d::CpuConv2d()
- : _function()
+CpuConv2d::CpuConv2d() : _function()
{
}
CpuConv2d::~CpuConv2d() = default;
-void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuConv2d::configure(ITensorInfo *input,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
{
@@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso
_aux_mem = _function->workspace();
}
-Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM_CONV2D:
ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
@@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights,
return Status{};
}
-ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);
@@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
- const std::vector<ConfigurationMethod> known_configs =
- {
+ const std::vector<ConfigurationMethod> known_configs = {
// Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+ PadStrideInfo(1U, 1U, 2U, 2U)),
+ ConvolutionMethod::GEMM),
// VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionMethod::GEMM),
// Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM),
// Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
- };
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM)};
const auto find_config = [&](ConfigurationMethod c)
{
const ConvolutionConfiguration config = c.first;
const PadStrideInfo info = std::get<3>(config);
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
};
std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
{
return (*found).second;
}
- if(dilation != Size2D(1U, 1U))
+ if (dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
@@ -173,43 +211,49 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
{
// SRGAN
// Output might not be initialized when it is an internal tensor of the layer using the convolution
- if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
- && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+ if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) &&
+ (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
{
return ConvolutionMethod::DIRECT;
}
- if(input->dimension(idx_c) < 16)
+ if (input->dimension(idx_c) < 16)
{
return ConvolutionMethod::GEMM;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// This heuristics only applies to F16 data type on A55r1
- if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+ if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math &&
+ input->data_type() == DataType::F16)
{
// Exclude known bad winograd configs (and defaults to GEMM)
- const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
- {
+ const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = {
// Squeezenet_V1_1 fire2 and fire3
- ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
// Squeezenet_V1_1 fire6 and fire7
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
// Squeezenet_V1_1 fire8 and fire9
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
};
const auto find_conv_config = [&](ConvolutionConfiguration c)
{
const PadStrideInfo info = std::get<3>(c);
- return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
};
- bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
- find_conv_config)
- != known_bad_winograd_f16_with_fastmath_configs.end();
- if(found_bad)
+ bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(),
+ known_bad_winograd_f16_with_fastmath_configs.end(),
+ find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end();
+ if (found_bad)
{
return ConvolutionMethod::GEMM;
}
@@ -217,16 +261,16 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// For 1x1 convolutions run the default GEMM
- if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+ if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
{
return ConvolutionMethod::GEMM;
}
- if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+ if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
{
return ConvolutionMethod::WINOGRAD;
}
- if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+ if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
{
return ConvolutionMethod::GEMM_CONV2D;
}
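For orientation, a minimal sketch of how the method-selection heuristic reformatted above can be queried from caller code; it is not part of this patch, and the shapes, layout and padding below are illustrative assumptions only.

#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuConv2d.h"

using namespace arm_compute;

ConvolutionMethod query_method_hint()
{
    // Illustrative 3x3, stride-1 convolution in NHWC
    // (dim0 = channels, dim1 = width, dim2 = height, dim3 = batch).
    TensorInfo src(TensorShape(32U, 28U, 28U, 1U), 1, DataType::F32);
    TensorInfo weights(TensorShape(32U, 3U, 3U, 64U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 28U, 28U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1U, 1U, 1U, 1U); // stride (1,1), pad (1,1)

    // WeightsInfo, dilation, activation and fast-math keep their defaults, so the
    // known-config table and the DIRECT/WINOGRAD/GEMM cascade above decide the
    // hint; for a shape like this WINOGRAD is the expected answer, with GEMM as
    // the general fallback.
    return cpu::CpuConv2d::get_convolution_method(&src, &weights, &dst, conv_info);
}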
diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h
index 0908ac0cbb..71b9e15dc1 100644
--- a/src/cpu/operators/CpuConv2d.h
+++ b/src/cpu/operators/CpuConv2d.h
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -102,17 +103,32 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
*
* Similar to CpuConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CpuConv2d
*
* @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -132,11 +148,17 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
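A minimal validate-then-configure sketch for the declarations above (not part of this patch; the operator and tensor infos are assumed to be provided by the caller, and all optional arguments keep the defaults shown in the header):

#include "src/cpu/operators/CpuConv2d.h"

using namespace arm_compute;

Status try_configure(cpu::CpuConv2d      &conv,
                     ITensorInfo         *src,
                     ITensorInfo         *weights,
                     const ITensorInfo   *biases,
                     ITensorInfo         *dst,
                     const PadStrideInfo &conv_info)
{
    // Static validation reports an error Status instead of asserting.
    const Status status = cpu::CpuConv2d::validate(src, weights, biases, dst, conv_info);
    if (bool(status))
    {
        conv.configure(src, weights, biases, dst, conv_info);
    }
    return status;
}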
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
index 810ffb1e4e..49e31926e3 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
@@ -31,7 +32,10 @@ namespace arm_compute
{
namespace cpu
{
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
@@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI
_kernel = std::move(k);
}
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
}
@@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
{
NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
index ea70eee134..e208cca3a0 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -41,14 +41,18 @@ public:
* @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ void
+ configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuConvertFullyConnectedWeights::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
};
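As a usage sketch for the interface above (not part of this patch; the 7x7x64 original shape is a hypothetical example of the feature map entering the fully connected layer):

#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"

using namespace arm_compute;

Status check_weight_conversion(const ITensorInfo &src, const ITensorInfo &dst)
{
    // Weights trained in NCHW, flattened from a 7x7x64 feature map.
    const TensorShape original_src_shape(7U, 7U, 64U);
    return cpu::CpuConvertFullyConnectedWeights::validate(&src, &dst, original_src_shape, DataLayout::NCHW);
}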
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
index 7420ff6240..92c19d4df2 100644
--- a/src/cpu/operators/CpuCopy.cpp
+++ b/src/cpu/operators/CpuCopy.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuCopy.h"
-#include "src/cpu/kernels/CpuCopyKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCopyKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
index 884fe5c4ed..54075f2afa 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -24,10 +24,11 @@
#include "src/cpu/operators/CpuDepthwiseConv2d.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -37,11 +38,16 @@ namespace cpu
{
namespace
{
-Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments_optimized(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- if(!is_data_type_quantized_per_channel(weights->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ if (!is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
@@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
- info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
- info.pad_stride_info.pad_bottom());
-
- if(biases != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) >
+ src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) >
+ src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ info.pad_stride_info.pad_bottom());
+
+ if (biases != nullptr)
{
- const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
@@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w
ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
// Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
@@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_has_bias = biases != nullptr;
@@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
_are_weights_const = weights->are_values_constant();
// Configure pipeline
- _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
+ _is_activationlayer_enabled =
+ info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
_dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
}
// Configure activation
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
_activationlayer_function->configure(dst, nullptr, info.act_info);
@@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
// Permute input
- if(_permute)
+ if (_permute)
{
ITensorPack pack;
auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Run assembly function
- if(_is_nchw)
+ if (_is_nchw)
{
auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Permute output
- if(_is_nchw)
+ if (_is_nchw)
{
ITensorPack pack;
auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Run activation
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
{
// if weights are not constant then we need to repack so that weights
// can be updated in-place
- if(!_are_weights_const)
+ if (!_are_weights_const)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
return;
}
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
// Permute weights
- if(_permute)
+ if (_permute)
{
auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
}
}
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
_is_nchw = src->data_layout() == DataLayout::NCHW;
_is_prepared = !_is_nchw;
@@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
auto input_perm = std::make_unique<TensorInfo>();
auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ auto output_perm = std::make_unique<TensorInfo>(
+ dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
_depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
_depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_output = std::make_unique<cpu::CpuPermute>();
_permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
@@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
//Configure Activation Layer
_is_activationlayer_enabled = info.act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
_activationlayer_function->configure(dst, nullptr, info.act_info);
}
}
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- if(src->data_layout() == DataLayout::NCHW)
+ if (src->data_layout() == DataLayout::NCHW)
{
TensorShape permuted_input_shape = src->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ TensorShape permuted_output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+ const TensorInfo permuted_input = TensorInfo(src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(dst->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NCHW));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
}
// Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
@@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- if(_is_nchw)
+ if (_is_nchw)
{
prepare(tensors);
auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
@@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
}
else
{
@@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
pack_depth.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
}
- if(_is_nchw)
+ if (_is_nchw)
{
ITensorPack pack;
auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
_permute_output->run(pack);
}
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -441,12 +476,17 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors
}
}
-void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
- _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
- switch(_depth_conv_func)
+ _depth_conv_func =
+ get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.configure(src, weights, biases, dst, info);
@@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights,
}
}
-Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
- switch(depth_conv_func)
+ switch (depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
@@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w
}
}
-DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info)
{
- if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+ if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
{
return DepthwiseConvolutionFunction::OPTIMIZED;
}
@@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi
void CpuDepthwiseConv2d::run(ITensorPack &tensors)
{
- switch(_depth_conv_func)
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.run(tensors);
@@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors)
void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
{
- switch(_depth_conv_func)
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.prepare(tensors);
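For orientation, a sketch (not part of this patch) of the dispatch that the code above implements: the OPTIMIZED assembly-backed path is chosen whenever its validation passes, otherwise the GENERIC native kernel is used.

#include "src/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute;

bool uses_assembly_path(const ITensorInfo *src, const ITensorInfo *weights,
                        const ITensorInfo *biases, const ITensorInfo *dst,
                        const ConvolutionInfo &info)
{
    return cpu::CpuDepthwiseConv2d::get_depthwiseconvolution_function(src, weights, biases, dst, info) ==
           DepthwiseConvolutionFunction::OPTIMIZED;
}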
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h
index 3d8719ee44..7eaa0df857 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.h
+++ b/src/cpu/operators/CpuDepthwiseConv2d.h
@@ -24,8 +24,9 @@
#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
-#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -56,14 +57,22 @@ public:
* Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
*
* @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -76,7 +85,10 @@ public:
*
* @return a Depthwise Convolution Function
*/
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info);
    // Inherited methods overridden:
@@ -118,32 +130,40 @@ private:
* @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
    // Inherited methods overridden:
void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
private:
- std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _has_bias{ false };
- bool _is_quantized{ false };
- bool _is_nchw{ true };
- bool _permute{ false };
- bool _is_activationlayer_enabled{ false };
- bool _is_prepared{ false };
- bool _are_weights_const{ true };
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _has_bias{false};
+ bool _is_quantized{false};
+ bool _is_nchw{true};
+ bool _permute{false};
+ bool _is_activationlayer_enabled{false};
+ bool _is_prepared{false};
+ bool _are_weights_const{true};
};
/** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
@@ -176,7 +196,11 @@ private:
* Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
@@ -184,24 +208,28 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _is_nchw{ true };
- bool _is_prepared{ false };
- bool _is_activationlayer_enabled{ false };
+ std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _is_nchw{true};
+ bool _is_prepared{false};
+ bool _is_activationlayer_enabled{false};
};
- DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC };
+ DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC};
CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
CpuDepthwiseConv2dGeneric _func_generic{};
};
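A minimal run-time sketch for the tensor-pack interface above (not part of this patch; the operator is assumed to be configured already, and the auxiliary ACL_INT_* workspace tensors reported by workspace() are omitted for brevity):

#include "arm_compute/core/ITensorPack.h"
#include "src/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute;

void run_depthwise(cpu::CpuDepthwiseConv2d &dw_conv,
                   const ITensor *src, const ITensor *weights,
                   const ITensor *biases, ITensor *dst)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
    pack.add_const_tensor(TensorType::ACL_SRC_2, biases);
    pack.add_tensor(TensorType::ACL_DST_0, dst);

    dw_conv.prepare(pack); // packs/permutes the constant weights once
    dw_conv.run(pack);     // executes the path selected at configure() time
}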
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index d078155155..8d3741de96 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -38,15 +39,14 @@ namespace cpu
{
struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
{
- std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
- bool is_prepared{ false };
- bool are_weights_const{ true };
+ std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr};
+ bool is_prepared{false};
+ bool are_weights_const{true};
experimental::MemoryRequirements mem_req{};
};
#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
- : _pImpl(std::make_unique<LocalImpl>())
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>())
{
}
#endif /* DOXYGEN_SKIP_THIS */
@@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
_pImpl->are_weights_const = weights->are_values_constant();
// If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+ if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
{
return;
}
@@ -77,12 +77,16 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
// Compute memory requirements for assembly kernels
constexpr size_t alignment = 4096;
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment });
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment});
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment});
_pImpl->asm_kernel = std::move(dwc_wrapper);
}
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
}
@@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
{
const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
+ if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
{
// Pack weights and bias
const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
const auto weights_padding = weights->info()->padding();
const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
- const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+ const size_t ld_weights_row =
+ ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
_pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
weights->mark_as_unused();
- if(bias != nullptr)
+ if (bias != nullptr)
{
bias->mark_as_unused();
}
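As an illustration of how the memory requirements registered above can be consumed (not part of this patch; the MemoryInfo field names follow the aggregate used in configure(), and summing size plus alignment is only a conservative upper bound):

#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"

using namespace arm_compute;

size_t total_workspace_bytes(const cpu::CpuDepthwiseConv2dAssemblyDispatch &dispatch)
{
    size_t total = 0;
    // One ACL_INT_0 working buffer plus a persistent ACL_INT_1 storage buffer,
    // both requested with 4096-byte alignment above.
    for (const auto &req : dispatch.workspace())
    {
        total += req.size + req.alignment;
    }
    return total;
}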
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
index f222ab9cf9..f1816625d2 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -53,14 +54,22 @@ public:
* @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Checks if activation is supported by the assembly kernels
*
* @param[in] activation Activation to check
@@ -70,8 +79,8 @@ public:
static bool is_activation_supported(const ActivationLayerInfo &activation);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp
index 12dc136ba3..c05a23f3a7 100644
--- a/src/cpu/operators/CpuDequantize.cpp
+++ b/src/cpu/operators/CpuDequantize.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuDequantizeKernel.h"
diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp
index 9cdbdb61c1..135a3bb2b9 100644
--- a/src/cpu/operators/CpuDirectConv2d.cpp
+++ b/src/cpu/operators/CpuDirectConv2d.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -36,12 +37,25 @@ namespace cpu
CpuDirectConv2d::~CpuDirectConv2d() = default;
CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
- _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+ : _memory_group(std::move(memory_manager)),
+ _output_stage_kernel(),
+ _conv_kernel(),
+ _input_border_handler(),
+ _activationlayer_function(),
+ _accumulator(),
+ _has_bias(false),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ),
+ _is_padding_required()
{
}
-void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CpuDirectConv2d::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info);
@@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT
_input_border_handler = std::make_unique<NEFillBorderKernel>();
// Free accumulator
- if(_accumulator.buffer() != nullptr)
+ if (_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
@@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT
_has_bias = (bias != nullptr);
_conv_kernel->configure(src, weights, dst, conv_info);
- if(_has_bias)
+ if (_has_bias)
{
_output_stage_kernel->configure(dst, bias);
}
_is_padding_required = !_conv_kernel->border_size().empty();
- if(_is_padding_required)
+ if (_is_padding_required)
{
// Add zero padding XY
- _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue(static_cast<float>(0.f)));
}
//Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<CpuActivation>();
_activationlayer_function->configure(dst, dst, act_info);
}
}
-Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+Status CpuDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
@@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig
// Validate bias kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
}
@@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors)
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_is_padding_required)
+ if (_is_padding_required)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_DST, src);
- NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+ NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(),
+ pack);
}
NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_has_bias)
+ if (_has_bias)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, dst);
@@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
}
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
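A validation sketch for the CpuDirectConv2d flow above, i.e. border fill (when required), the direct convolution kernel, the optional bias stage and the optional fused activation (not part of this patch; the fused ReLU is an illustrative choice):

#include "src/cpu/operators/CpuDirectConv2d.h"

using namespace arm_compute;

bool direct_conv_supported(const ITensorInfo *src, const ITensorInfo *weights,
                           const ITensorInfo *bias, const ITensorInfo *dst,
                           const PadStrideInfo &conv_info)
{
    const ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);
    return bool(cpu::CpuDirectConv2d::validate(src, weights, bias, dst, conv_info, relu));
}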
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
index fa8d61e083..73c85f2dcd 100644
--- a/src/cpu/operators/CpuDirectConv2d.h
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -24,13 +24,14 @@
#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -75,14 +76,23 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -95,10 +105,10 @@ private:
std::unique_ptr<NEFillBorderKernel> _input_border_handler;
std::unique_ptr<CpuActivation> _activationlayer_function;
Tensor _accumulator;
- bool _has_bias{ false };
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
- bool _is_padding_required{ false };
+ bool _has_bias{false};
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
+ bool _is_padding_required{false};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp
index aa74e420a6..626f1c6775 100644
--- a/src/cpu/operators/CpuDirectConv3d.cpp
+++ b/src/cpu/operators/CpuDirectConv3d.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -36,11 +37,17 @@ namespace cpu
CpuDirectConv3d::~CpuDirectConv3d() = default;
CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+ : _memory_group(std::move(memory_manager)),
+ _conv_kernel(),
+ _activationlayer_function(),
+ _accumulator(),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ)
{
}
-void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
+void CpuDirectConv3d::configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
{
ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info);
ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
@@ -48,7 +55,7 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen
_conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>();
// Free accumulator
- if(_accumulator.buffer() != nullptr)
+ if (_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
@@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen
//Configure Activation Layer
_is_activationlayer_enabled = conv_info.act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<CpuActivation>();
_activationlayer_function->configure(dst, dst, conv_info.act_info);
}
}
-Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info)
+Status CpuDirectConv3d::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info));
- if(conv_info.act_info.enabled())
+ if (conv_info.act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info));
}
@@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors)
}
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
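A small sketch of the 3D counterpart above (not part of this patch; src0/src1/src2 follow the NDHWC input/weights/optional-bias convention documented in the header, and conv_info is assumed to be built by the caller):

#include "src/cpu/operators/CpuDirectConv3d.h"

using namespace arm_compute;

Status check_direct_conv3d(const ITensorInfo *src0, const ITensorInfo *src1,
                           const ITensorInfo *src2, const ITensorInfo *dst,
                           const Conv3dInfo &conv_info)
{
    // conv_info bundles stride, padding and the optional fused activation.
    return cpu::CpuDirectConv3d::validate(src0, src1, src2, dst, conv_info);
}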
diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h
index cde01f07c2..3ad1e09a14 100644
--- a/src/cpu/operators/CpuDirectConv3d.h
+++ b/src/cpu/operators/CpuDirectConv3d.h
@@ -24,14 +24,15 @@
#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H
#define ARM_COMPUTE_CPU_DIRECTCONV3D_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -76,14 +77,19 @@ public:
 * The 1st dimension must be equal to the 1st dimension of the @p kernels tensor.
 * @param[in] conv_info Contains padding, stride, activation information.
*/
- void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
+ void configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv3d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -93,8 +99,8 @@ private:
std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel;
std::unique_ptr<CpuActivation> _activationlayer_function;
Tensor _accumulator;
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp
index b88ae3e514..c2ae8773c6 100644
--- a/src/cpu/operators/CpuElementwise.cpp
+++ b/src/cpu/operators/CpuElementwise.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuElementwise.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/CpuElementwiseKernel.h"
@@ -33,7 +34,7 @@ namespace cpu
void CpuElementwiseBase::run(ITensorPack &tensors)
{
// If the kernel has been configured, use the window from the kernel.
- if(_kernel->is_window_configured())
+ if (_kernel->is_window_configured())
{
ICpuOperator::run(tensors);
return;
@@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, con
}
template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
}
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
+void CpuElementwiseComparison::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ComparisonOperation op)
{
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
auto k = std::make_unique<kernels::CpuComparisonKernel>();
@@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI
_kernel = std::move(k);
}
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
+Status CpuElementwiseComparison::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ComparisonOperation op)
{
return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
}
@@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>
template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
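For the comparison operators above, a brief sketch of the run-time-selectable variant (not part of this patch; the templated CpuElementwiseComparisonStatic instantiations fix the operation at compile time instead):

#include "src/cpu/operators/CpuElementwise.h"

using namespace arm_compute;

void configure_greater(cpu::CpuElementwiseComparison &cmp,
                       const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *out)
{
    if (bool(cpu::CpuElementwiseComparison::validate(lhs, rhs, out, ComparisonOperation::Greater)))
    {
        cmp.configure(lhs, rhs, out, ComparisonOperation::Greater);
    }
}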
diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h
index b6c61cf245..5db53c8026 100644
--- a/src/cpu/operators/CpuElementwise.h
+++ b/src/cpu/operators/CpuElementwise.h
@@ -139,7 +139,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
};
/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
@@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqua
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp
index 7fd14dba7d..04ab7bf8f5 100644
--- a/src/cpu/operators/CpuElementwiseUnary.cpp
+++ b/src/cpu/operators/CpuElementwiseUnary.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuElementwiseUnary.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
@@ -47,7 +48,7 @@ Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src
void CpuElementwiseUnary::run(ITensorPack &tensors)
{
- if(_kernel->is_window_configured())
+ if (_kernel->is_window_configured())
{
ICpuOperator::run(tensors);
return;
@@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors)
ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
}
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h
index 5e8e98d047..1e51bfaa1c 100644
--- a/src/cpu/operators/CpuElementwiseUnary.h
+++ b/src/cpu/operators/CpuElementwiseUnary.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
#include "arm_compute/core/Types.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -56,4 +57,4 @@ public:
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
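[Illustrative sketch, not part of this patch: the CpuElementwiseUnary::validate() signature visible in the hunk header above takes the operation enum plus ITensorInfo references; a hedged minimal check might look as follows, with the 1D F32 shape and the RSQRT operation picked only for illustration.]

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuElementwiseUnary.h"

void sketch_validate_unary()
{
    // Element-wise unary ops keep the source shape and data type for the destination.
    const arm_compute::TensorInfo src(arm_compute::TensorShape(64U), 1, arm_compute::DataType::F32);
    const arm_compute::TensorInfo dst(arm_compute::TensorShape(64U), 1, arm_compute::DataType::F32);

    const arm_compute::Status status =
        arm_compute::cpu::CpuElementwiseUnary::validate(arm_compute::ElementWiseUnary::RSQRT, src, dst);
    (void)status;
}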
diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp
index 3d8f62fe07..1890d0b916 100644
--- a/src/cpu/operators/CpuFill.cpp
+++ b/src/cpu/operators/CpuFill.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuFill.h"
-#include "src/cpu/kernels/CpuFillKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFillKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h
index 41d9a9fa8a..cb83745d29 100644
--- a/src/cpu/operators/CpuFill.h
+++ b/src/cpu/operators/CpuFill.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_FILL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
index 7bab9e481c..2609d44590 100644
--- a/src/cpu/operators/CpuFlatten.cpp
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -23,16 +23,14 @@
*/
#include "src/cpu/operators/CpuFlatten.h"
-#include "src/cpu/operators/CpuReshape.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuReshape.h"
namespace arm_compute
{
namespace cpu
{
-CpuFlatten::CpuFlatten()
- : _reshape(nullptr)
+CpuFlatten::CpuFlatten() : _reshape(nullptr)
{
}
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
index 868add7d29..a107393b01 100644
--- a/src/cpu/operators/CpuFloor.cpp
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuFloor.h"
-#include "src/cpu/kernels/CpuFloorKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
index 395d8d2aa5..85a0b0311b 100644
--- a/src/cpu/operators/CpuFullyConnected.cpp
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -25,10 +25,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
@@ -49,8 +50,11 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
const auto data_type = src->data_type();
const QuantizationInfo oq_info = dst->quantization_info();
@@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
int32_t output_multiplier;
int32_t output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- int32_t type_min = 0;
- int32_t type_max = 0;
+ int32_t type_min = 0;
+ int32_t type_max = 0;
std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format)
+Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ bool enable_fast_math,
+ WeightFormat weight_format)
{
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
@@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe
// Validate gemmlowp function
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
- &weights_info,
- biases,
- dst,
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info));
}
else
{
@@ -142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected()
CpuFullyConnected::~CpuFullyConnected() = default;
-void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
- if(_is_quantized_asymmetric)
+ if (_is_quantized_asymmetric)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
// Configure gemmlowp function and output stage for asymmetric quantized types
GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
+ const Status status =
+ get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
GEMMInfo gemm_info;
@@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *
}
}
-void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
@@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI
configure_mm(&_flattened_src, weights, biases, dst, act);
}
-void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
@@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf
configure_mm(src, weights, biases, dst, act);
}
-void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+void CpuFullyConnected::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- fc_info,
- weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
_needs_weights_conversion = false;
@@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
@@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Reshape weights if needed
- if(_needs_weights_reshape)
+ if (_needs_weights_reshape)
{
// Reshape the weights
_transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
@@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Convert weights if needed
- if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Convert weights
_convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
- _convert_weights->configure(weights_to_use,
- &_converted_weights,
- src->tensor_shape(),
+ _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(),
fc_info.weights_trained_layout);
_converted_weights.set_are_values_constant(weights_to_use->are_values_constant());
@@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
_trans_weights_idx = AuxTensorIdx::ConvertedWeights;
}
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
@@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Retain the tensorinfo with the weights to use
- if(_needs_weights_reshape || _needs_weights_conversion)
+ if (_needs_weights_reshape || _needs_weights_conversion)
{
_trans_weights = *weights_to_use;
}
// Set auxiliary memory requirements
auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
- for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
{
_aux_mem[i] = gemm_mem_req[i];
}
- if(_aux_mem[Pretranspose].size > 0)
+ if (_aux_mem[Pretranspose].size > 0)
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
// Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
// Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time.
_aux_mem[TransposedWeights] = MemoryInfo(
offset_int_vec(TransposedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary :
- (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent :
- MemoryLifetime::Prepare,
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare,
_reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
- _converted_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
}
else
{
- _aux_mem[TransposedWeights] = MemoryInfo(
- offset_int_vec(TransposedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary :
- _needs_weights_conversion ? MemoryLifetime::Prepare :
- MemoryLifetime::Persistent,
- _reshaped_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : _needs_weights_conversion ? MemoryLifetime::Prepare
+ : MemoryLifetime::Persistent,
+ _reshaped_weights.total_size());
_aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
+ offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
_converted_weights.total_size());
}
- _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+ _aux_mem[FlattenedSrc] =
+ MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
-Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info)
+Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info)
{
GEMMInfo gemm_info;
gemm_info.set_activation_info(fc_info.activation_info);
@@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
-Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+Status CpuFullyConnected::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
if (is_fixed_format_fast_math(weights_info.weight_format()))
{
@@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
}
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
bool is_fc_after_conv = true;
- const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &flatten_src =
+ TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
+ const ITensorInfo &reshaped_weights = TensorInfo(
+ weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped
+ ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
}
}
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
is_fc_after_conv = src->num_dimensions() > 1;
}
- if(!weights_reshaped)
+ if (!weights_reshaped)
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
- if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
weights_to_use = &converted_weights;
}
- if(is_fc_after_conv)
+ if (is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
// Validate flatten kernel
ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
@@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
}
// Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format()));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info,
+ fc_info.enable_fast_math, weights_info.weight_format()));
return Status{};
}
@@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors)
CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
// Linearize src if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
- ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+ ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
_flatten->run(flatten_pack);
}
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
- if(_needs_weights_reshape || _needs_weights_conversion)
+ if (_needs_weights_reshape || _needs_weights_conversion)
{
gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
}
// Run matrix multiply
- if(_is_quantized_asymmetric)
+ if (_is_quantized_asymmetric)
{
_mm_gemmlowp->run(gemm_pack);
}
@@ -486,7 +528,7 @@ void CpuFullyConnected::run(ITensorPack &tensors)
void CpuFullyConnected::prepare(ITensorPack &tensors)
{
- if(!_is_prepared || _dynamic_weights)
+ if (!_is_prepared || _dynamic_weights)
{
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
++_asrt_prepare_count;
@@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
const ITensor *cur_weights = weights;
// Reshape of the weights (happens only once)
- if(_needs_weights_reshape)
+ if (_needs_weights_reshape)
{
// Run reshape weights kernel and mark weights as unused
- ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
- NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+ NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(),
+ transpose_pack);
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
}
// Convert weights if needed (happens only once)
- if(_needs_weights_conversion)
+ if (_needs_weights_conversion)
{
- ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+ ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
_convert_weights->run(convert_pack);
cur_weights->mark_as_unused();
@@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Prepare GEMM prepare and release unused weights
- if(!_is_quantized_asymmetric)
+ if (!_is_quantized_asymmetric)
{
_mm_gemm->prepare(gemm_pack);
}
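[Editor's note, illustrative only and not part of this patch: the reflowed nested ternaries that choose a MemoryLifetime for the TransposedWeights auxiliary tensor in configure() above encode the decision restated below. The helper name and its boolean parameters are hypothetical; this does not change the logic already present in the diff.]

#include "arm_compute/core/experimental/Types.h"

// Equivalent control flow for the first TransposedWeights lifetime selection
// (the branch taken when the assembly dispatch pre-transposes the weights).
arm_compute::experimental::MemoryLifetime transposed_weights_lifetime(bool dynamic_weights,
                                                                      bool is_quantized_asymmetric,
                                                                      bool has_biases,
                                                                      bool biases_are_constant)
{
    using arm_compute::experimental::MemoryLifetime;
    if (dynamic_weights)
    {
        return MemoryLifetime::Temporary; // recalculated on every run, never kept
    }
    if (is_quantized_asymmetric && has_biases && !biases_are_constant)
    {
        return MemoryLifetime::Persistent; // kept for the bias-offset calculation
    }
    return MemoryLifetime::Prepare; // only needed while prepare() runs
}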
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
index 1e8c6478d0..7073fb9f7c 100644
--- a/src/cpu/operators/CpuFullyConnected.h
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
-#include "src/cpu/ICpuOperator.h"
-
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+#include "src/cpu/ICpuOperator.h"
+
#include <memory>
namespace arm_compute
@@ -86,16 +86,24 @@ public:
* @param[in] fc_info (Optional) Fully connected layer additional info
* @param[in] weights_info (Optional) Stores necessary compute information when weights are already reshaped
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
*
* Similar to @ref CpuFullyConnected::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function that queries whether a fixed-format kernel exists and, if it does, returns in the first argument the format in which the
* weights are expected to be reshaped, as defined by the WeightFormat class. Apart from the first argument, the rest of the arguments are the same
@@ -103,19 +111,35 @@ public:
*
* @return a status
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, WeightsInfo weights_info);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
- void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+ void configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
enum AuxTensorIdx
{
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 34b845928d..8da166dbef 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/operators/CpuGemm.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -57,17 +58,25 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
}
} // namespace
-void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void CpuGemm::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool is_c_bias = beta == 1 && c != nullptr;
- bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
- (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
- !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool is_c_bias = beta == 1 && c != nullptr;
+ bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
// Check if we need to reshape the matrix B only on the first run
_is_prepared = false;
@@ -76,9 +85,12 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_run_alpha_scale = alpha != 1.f;
_run_bias_addition = is_c_bias;
_run_addition = beta != 0 && beta != 1 && c != nullptr;
- _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+ _run_activation =
+ gemm_info.activation_info().enabled() &&
+ (!run_optimised ||
+ (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
- if(run_optimised)
+ if (run_optimised)
{
const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
@@ -90,10 +102,11 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
// Scale product by alpha
- if(_run_alpha_scale)
+ if (_run_alpha_scale)
{
_alpha_scale_func = std::make_unique<cpu::CpuActivation>();
- _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+ _alpha_scale_func->configure(
+ d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
}
}
else
@@ -104,7 +117,7 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
// Select between GEMV and GEMM
- if(_run_vector_matrix_multiplication)
+ if (_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
_mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
@@ -118,41 +131,50 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
// Configure interleave kernel
_interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
_interleave_kernel->configure(a, &_tmp_a);
- _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[InterleavedLHS] =
+ MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
// Configure transpose kernel
_transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
_transpose_kernel->configure(b, &_tmp_b);
- _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+ _aux_mem[TransposedRHS] =
+ MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
// Configure matrix multiplication kernel
_mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
}
- if(_run_bias_addition)
+ if (_run_bias_addition)
{
_add_bias = std::make_unique<cpu::CpuAdd>();
_add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
- _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
+ _aux_mem[TempResult] =
+ MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
}
}
// Configure matrix addition kernel
- if(_run_addition)
+ if (_run_addition)
{
_ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
_ma_kernel->configure(c, d, beta);
}
// Configure activation
- if(_run_activation)
+ if (_run_activation)
{
_activation_func = std::make_unique<cpu::CpuActivation>();
_activation_func->configure(d, nullptr, gemm_info.activation_info());
}
}
-Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CpuGemm::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
const bool is_c_bias = beta == 1 && c != nullptr;
@@ -162,7 +184,7 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_fixed_format_fast_math(gemm_info.weight_format()))
+ if (is_fixed_format_fast_math(gemm_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -174,46 +196,54 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
const int block_by = arm_compute::block_by(gemm_info.weight_format());
// test if im2col has changed the dimensions that are needed for padding
- if(a->dimension(0) != b->dimension(1) && block_by > 1)
+ if (a->dimension(0) != b->dimension(1) && block_by > 1)
{
// have to verify bias
const size_t dim0_sz = a->dimension(0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz % block_by) != 0,
+ ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
// a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
// b->dimension(1) = kernel_area * input_channel
// a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+ "The product AB is defined only if A number of columns and B number of rows are related");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ a->dimension(0) != b->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(a->data_type() != DataType::BFLOAT16)
+ if (a->data_type() != DataType::BFLOAT16)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
}
- if(run_addition)
+ if (run_addition)
{
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
+ "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+ "The C matrix must have the same number of columns as the matrix B");
}
- if(d->total_size() != 0)
+ if (d->total_size() != 0)
{
// For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if (gemm_info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if (gemm_info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
@@ -230,15 +260,19 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
}
// Check if we need to run the optimized assembly kernel
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
- (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
- !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
-
- if(!run_optimised)
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(),
+ "CpuGemm cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0,
+ "CpuGemm cannot reinterpret the output tensor as 3D");
// Check if the first input tensor is a vector.
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
@@ -254,7 +288,8 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(
+ m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
@@ -263,39 +298,44 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
TensorInfo tmp_b_info{};
TensorInfo tmp_output_info = *d->clone();
- if(run_interleave_transpose)
+ if (run_interleave_transpose)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(
+ *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
+ *b, mult_transpose1xW_width)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
}
// Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+ auto_init_if_empty(tmp_output_info,
+ matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
- if(is_c_bias)
+ if (is_c_bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
}
}
// Validate matrix addition kernel
- if(run_addition)
+ if (run_addition)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
}
// Validate activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
+ if (activation.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
}
@@ -312,15 +352,15 @@ void CpuGemm::run(ITensorPack &tensors)
auto c = tensors.get_const_tensor(ACL_SRC_2);
auto d = tensors.get_tensor(ACL_DST);
- if(_asm_glue && _asm_glue->is_configured())
+ if (_asm_glue && _asm_glue->is_configured())
{
// Pass c to asm dispatch only if it's the bias tensor
ITensorPack asm_pack = tensors;
asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr);
_asm_glue->run(asm_pack);
- if(_run_alpha_scale)
+ if (_run_alpha_scale)
{
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
_alpha_scale_func->run(pack);
}
}
@@ -330,18 +370,20 @@ void CpuGemm::run(ITensorPack &tensors)
CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
- ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
- if(!_run_vector_matrix_multiplication)
+ ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
+ if (!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
- NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
+ ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
+ NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
+ interleave_pack);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+ transpose_pack);
}
// Use reshaped matrices
@@ -349,48 +391,52 @@ void CpuGemm::run(ITensorPack &tensors)
mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
}
- NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
+ NEScheduler::get().schedule_op(_mm_kernel.get(),
+ _run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
+ _mm_kernel->window(), mm_pack);
// Run bias addition kernel
- if(_run_bias_addition)
+ if (_run_bias_addition)
{
- ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}};
_add_bias->run(pack);
}
}
// Run matrix addition kernel
- if(_run_addition)
+ if (_run_addition)
{
- ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
+ ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}};
NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
}
// Run activation function
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
_activation_func->run(pack);
}
}
void CpuGemm::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_asm_glue && _asm_glue->is_configured())
+ if (_asm_glue && _asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
}
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
{
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *b_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+ transpose_pack);
}
_is_prepared = true;
}
@@ -401,8 +447,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const
return _aux_mem;
}
-Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const GEMMInfo &gemm_info)
+Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info)
{
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
index 9b08e5d0f6..6b30d134fa 100644
--- a/src/cpu/operators/CpuGemm.h
+++ b/src/cpu/operators/CpuGemm.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CPU_GEMM_H
#define ARM_COMPUTE_CPU_GEMM_H
-#include "src/cpu/ICpuOperator.h"
-
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
@@ -93,16 +93,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should happen only for the first run
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
*
* Similar to @ref CpuGemm::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -111,12 +121,16 @@ public:
* the value of arm_compute::WeightFormat needs to be passed via the
* parameter gemm_info.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const GEMMInfo &gemm_info = GEMMInfo());
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
/** Indicates if the convolution executes in variable weights mode.
@@ -138,28 +152,28 @@ private:
Count
};
- std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
- std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr };
- std::unique_ptr<CpuAdd> _add_bias{ nullptr };
- std::unique_ptr<CpuActivation> _activation_func{ nullptr };
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr};
+ std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr};
+ std::unique_ptr<CpuAdd> _add_bias{nullptr};
+ std::unique_ptr<CpuActivation> _activation_func{nullptr};
TensorInfo _tmp_a{};
TensorInfo _tmp_b{};
TensorInfo _tmp_d{};
- bool _run_vector_matrix_multiplication{ false };
- bool _run_alpha_scale{ false };
- bool _run_addition{ false };
- bool _run_bias_addition{ false };
- bool _run_activation{ false };
- bool _reshape_b_only_on_first_run{ false };
- bool _is_prepared{ false };
+ bool _run_vector_matrix_multiplication{false};
+ bool _run_alpha_scale{false};
+ bool _run_addition{false};
+ bool _run_bias_addition{false};
+ bool _run_activation{false};
+ bool _reshape_b_only_on_first_run{false};
+ bool _is_prepared{false};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
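[Illustrative sketch, not part of this patch: the internal cpu::CpuGemm operator reformatted above sits behind the public NEGEMM runtime function; a hedged, minimal caller is shown below. The 8x32 by 32x16 shapes, alpha = 1 and beta = 0 are assumptions, and the buffers are left uninitialised since only the call pattern matters here.]

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
    using namespace arm_compute;

    // TensorShape lists the innermost dimension first: A is 8x32, B is 32x16, D is 8x16.
    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

    NEGEMM gemm;
    gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f); // D = 1 * A * B, no C matrix

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    gemm.run();
    return 0;
}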
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 39b410d609..7c59d88c61 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -26,9 +26,9 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
@@ -52,8 +52,11 @@ namespace arm_compute
{
namespace cpu
{
-CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info)
+CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info)
{
const DataLayout data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -62,63 +65,86 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
const unsigned int kernel_height = weights->dimension(idx_height);
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- if(skip_im2col)
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ if (skip_im2col)
{
- const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
- if(skip_col2im)
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
+ if (skip_col2im)
{
- return { true, true };
+ return {true, true};
}
}
else
{
- const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
- if(skip_col2im)
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
+ if (skip_col2im)
{
- return { false, true };
+ return {false, true};
}
}
// Default case when we cannot reinterpret the input and output as 3D.
- return { false, false };
+ return {false, false};
}
CpuGemmConv2d::CpuGemmConv2d()
- : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+ : _weights_reshape_kernel(nullptr),
+ _im2col_kernel(),
+ _mm_gemm(),
+ _mm_gemmlowp(),
+ _col2im_kernel(),
+ _reshape(),
+ _im2col_output(),
+ _weights_reshaped(),
+ _gemm_output(),
+ _gemm_output_3d(),
+ _data_layout(DataLayout::NCHW),
+ _skip_im2col(false),
+ _skip_col2im(false),
+ _is_quantized(false),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
{
}
CpuGemmConv2d::~CpuGemmConv2d() = default;
-void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info,
- bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format)
+void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth,
+ _skip_im2col, fixed_format, weight_format));
// Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+ const GEMMInfo &gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weight_format);
// Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
- if(_is_quantized)
+ if (_is_quantized)
{
- TensorInfo tmp_src{ *src };
- TensorInfo tmp_weights{ *weights };
+ TensorInfo tmp_src{*src};
+ TensorInfo tmp_weights{*weights};
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo iqinfo = src->quantization_info();
@@ -129,7 +155,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
const DataType data_type = src->data_type();
tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+ if (!is_data_type_quantized_per_channel(tmp_weights.data_type()))
{
const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
@@ -142,7 +168,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act_info.activation()) != 0)
+ if (supported_acts.count(act_info.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
}
@@ -156,11 +182,12 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
- weight_format));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false,
+ enable_fast_math, false, act_info, fixed_format, weight_format));
auto mm_mem_req = _mm_gemmlowp->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
@@ -171,26 +198,35 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
_mm_gemm = std::make_unique<CpuGemm>();
_mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
auto mm_mem_req = _mm_gemm->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
}
}
-Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format)
+Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
{
const DataType data_type = src->data_type();
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool is_activation_enabled = act_info.enabled();
// Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weight_format);
- if(is_quantized)
+ if (is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -206,11 +242,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
}
@@ -229,8 +264,9 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math,
- false, act_info));
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false,
+ output_info, false, enable_fast_math, false, act_info));
}
else
{
@@ -239,36 +275,44 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
}
}
-Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col)
{
const DataType data_type = input_info->data_type();
const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
// Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type,
+ input_info->quantization_info());
const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type,
+ input_info->quantization_info());
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col);
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false,
+ gemm_3d_depth, skip_im2col);
}
-void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuGemmConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src,
- weights,
- biases,
- dst,
- conv_info,
- weights_info,
- dilation,
- act_info,
- enable_fast_math,
- num_groups));
- ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math,
+ num_groups);
const DataType data_type = src->data_type();
const DataLayout data_layout = src->data_layout();
@@ -283,7 +327,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
_is_prepared = weights_info.retain_internal_weights();
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
const ITensorInfo *gemm_input_to_use = src;
ITensorInfo *gemm_output_to_use = dst;
@@ -291,20 +336,17 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
// Check if GEMM3D is supported
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
- _skip_im2col = skip_info.skip_im2col;
- _skip_col2im = skip_info.skip_col2im;
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ _skip_im2col = skip_info.skip_im2col;
+ _skip_col2im = skip_info.skip_col2im;
// Get parameters from conv_info
unsigned int stride_x = 0;
@@ -320,17 +362,19 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
_weights_reshaped.set_quantization_info(weights->quantization_info());
// Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
const int block_by = arm_compute::block_by(weights_info.weight_format());
unsigned int input_pad_right = 0;
- if(block_by > 1)
+ if (block_by > 1)
{
- input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
}
// Configure
_im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
- _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right);
+ _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation,
+ num_groups, input_pad_right);
// Update GEMM input
gemm_input_to_use = &_im2col_output;
@@ -338,7 +382,7 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
TensorShape shape_gemm;
@@ -368,9 +412,10 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format());
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math,
+ gemm_3d_depth, fixed_format, weights_info.weight_format());
- if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+ if (!_skip_col2im && _data_layout == DataLayout::NCHW)
{
// Configure col2im
_col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
@@ -390,14 +435,24 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
// Check lifetime
- _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
- _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+ _aux_mem[Im2ColOutput] =
+ MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
+ gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
+ _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
}
-Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math)
+Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
{
const DataLayout data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -406,36 +461,44 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
const unsigned int kernel_height = weights->dimension(idx_height);
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
- dilation, act_info);
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
const bool skip_im2col = skip_info.skip_im2col;
const bool skip_col2im = skip_info.skip_col2im;
const unsigned int gemm_3d_depth = skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
-Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuGemmConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
- if(!is_fixed_format(weights_info.weight_format()))
+ if (!is_fixed_format(weights_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
}
@@ -468,29 +531,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
// Check if GEMM3D is supported
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
- dilation, act_info);
- const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
- else if(is_bf16)
+ else if (is_bf16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
}
@@ -503,20 +562,23 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
}
unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+ unsigned int mat_weights_rows =
+ weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type());
weights_reshaped_info.set_quantization_info(weights->quantization_info());
weights_to_use = &weights_reshaped_info;
- if(!skip_im2col)
+ if (!skip_im2col)
{
const int block_by = arm_compute::block_by(weights_info.weight_format());
int input_pad_right = 0;
- if(block_by > 1)
+ if (block_by > 1)
{
- input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
- mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right);
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) *
+ (weights->dimension(idx_channel) + input_pad_right);
}
// Create tensor info for im2col reshaped inputs
@@ -528,13 +590,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
im2col_reshaped_info.set_quantization_info(src->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height),
+ conv_info, append_bias, dilation, num_groups, input_pad_right));
gemm_input_to_use = &im2col_reshaped_info;
}
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
+ if (!skip_col2im)
{
TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
@@ -549,13 +613,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
gemm_output_to_use = &info_gemm;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info,
+ enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
weights_info.weight_format()));
// Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
+ if (!skip_col2im && (data_layout == DataLayout::NCHW))
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
}
return Status{};
@@ -574,15 +640,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
// Run input reshaping
unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, im2col_output.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
gemm_input_to_use = im2col_output.get();
}
@@ -595,11 +657,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
gemm3d.allocator()->import_memory(out_to_use->buffer());
auto gemm_output_to_use = gemm_output.get();
- if(_skip_im2col)
+ if (_skip_im2col)
{
gemm_output_to_use = &gemm3d;
}
- if(_skip_col2im && !out_has_padding)
+ if (_skip_col2im && !out_has_padding)
{
gemm_output_to_use = dst;
}
@@ -607,12 +669,12 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
// Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
ITensorPack pack_mm = tensors;
pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
- if(!this->isVarWeightsKernel())
+ if (!this->isVarWeightsKernel())
{
pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
}
pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
- if(_is_quantized)
+ if (_is_quantized)
{
// Run gemmlowp
_mm_gemmlowp->run(pack_mm);
@@ -624,45 +686,33 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
}
// Reshape output matrix
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output.get() },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}};
NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
}
else
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
_reshape->run(pack);
}
}
- else if(out_has_padding)
+ else if (out_has_padding)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
_reshape->run(pack);
}
}
void CpuGemmConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Variable weights executions that use fixed-format kernels
// need no reshaping of the weights.
- if(this->isVarWeightsKernel())
+ if (this->isVarWeightsKernel())
{
_is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
_is_prepared = true;
@@ -672,11 +722,7 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors)
// Run weights reshaping and mark original weights tensor as unused
CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, weights_reshaped.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
weights->mark_as_unused();
ITensorPack gemm_pack = tensors;
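Besides the reflow, the hunks above make the im2col/col2im skip logic in CpuGemmConv2d easier to follow. A compact sketch of that decision, with plain parameters standing in for ITensorInfo/PadStrideInfo and the GEMM3D validation reduced to a boolean (assumptions made for illustration, not the library API):

// Sketch of the decision implemented by CpuGemmConv2d::skip_im_col_info above.
struct SkipInfo
{
    bool skip_im2col;
    bool skip_col2im;
};

SkipInfo skip_decision(bool is_nhwc, unsigned kernel_w, unsigned kernel_h,
                       unsigned stride_x, unsigned stride_y, bool gemm3d_supported)
{
    // im2col can only be skipped for 1x1 kernels with unit stride in NHWC layout.
    const bool skip_im2col =
        is_nhwc && kernel_w == 1 && kernel_h == 1 && stride_x == 1 && stride_y == 1;

    // col2im can be skipped when the GEMM output can be reinterpreted as 3D (NHWC only).
    const bool skip_col2im = is_nhwc && gemm3d_supported;

    if (skip_im2col && skip_col2im)
    {
        return {true, true};
    }
    if (skip_col2im)
    {
        return {false, true};
    }
    return {false, false}; // Default: cannot reinterpret the input and output as 3D.
}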
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
index 61fe63a79f..118d366517 100644
--- a/src/cpu/operators/CpuGemmConv2d.h
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
#include <memory>
@@ -106,17 +107,32 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmConvolution::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -124,10 +140,16 @@ public:
*
* @return a status.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- const bool enable_fast_math = false);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const bool enable_fast_math = false);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -150,8 +172,15 @@ private:
* @param[in] fixed_format (Optional) Select GEMM execution with variable weights.
* @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
*/
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -170,8 +199,16 @@ private:
*
* @return a status
*/
- static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ static Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool skip_im2col = false,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
/** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmMLowpMatrixMultiplyCore
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -182,7 +219,11 @@ private:
*
* @return a status
*/
- static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+ static Status validate_gemm3d(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col);
struct SkipInfo
{
@@ -200,8 +241,11 @@ private:
*
* @return a SkipInfo instance.
*/
- static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info);
+ static SkipInfo skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info);
/** Indicates if the convolution executes in variable weights mode.
*
@@ -236,7 +280,7 @@ private:
bool _is_quantized;
bool _is_prepared;
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
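The quantized path reflowed in configure_mm/validate_mm above negates the input/weights offsets and folds the supported activations into the requantization clamp. A rough sketch of that bound selection, assuming the quantized activation bounds are supplied by the caller rather than computed via the library's get_quantized_activation_min_max:

#include <cstdint>
#include <set>

// Simplified stand-ins for illustration; not the library's types.
enum class Act { NONE, RELU, BOUNDED_RELU, LU_BOUNDED_RELU, TANH };

struct Bounds
{
    int32_t lo;
    int32_t hi;
};

Bounds output_stage_bounds(Act act, Bounds type_range, Bounds quantized_act_range)
{
    // Only these activations can be merged into the GEMMLowp output stage clamp; any other
    // enabled activation is left to a separate activation pass.
    static const std::set<Act> supported_acts = {Act::RELU, Act::BOUNDED_RELU,
                                                 Act::LU_BOUNDED_RELU};

    Bounds clamp = type_range; // Default: full numeric range of the output data type.
    if (supported_acts.count(act) != 0)
    {
        clamp = quantized_act_range; // Narrow to the activation's quantized min/max.
    }
    return clamp;
}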
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
index 5ce285cb6f..8fa81b1907 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
-
#include "support/Cast.h"
#include <set>
@@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast;
namespace
{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
const DataType data_type = src->data_type();
// Merge activation with output stage
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- PixelValue type_min{};
- PixelValue type_max{};
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ PixelValue type_min{};
+ PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(data_type);
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act.activation()) != 0)
+ if (supported_acts.count(act.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
}
@@ -107,31 +109,32 @@ CpuGemmDirectConv2d::CpuGemmDirectConv2d()
CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
_run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
_is_prepared = false;
- _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
+ _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2});
// Configure assembly dispatch
cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
}
_gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
// Configure activation
- if(_run_activation)
+ if (_run_activation)
{
_activation_func->configure(dst, nullptr, info.act_info);
}
@@ -141,24 +144,33 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w
_aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
_aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
- if(_aux_mem[Pretranspose].size > 0)
+ if (_aux_mem[Pretranspose].size > 0)
{
// Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
}
else
{
// We must permute weights if they are WeightFormat::UNSPECIFIED
- if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+ if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
}
}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(!is_fixed_format(info.weights_info.weight_format()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ if (!is_fixed_format(info.weights_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
}
@@ -171,13 +183,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
- else if(data_type == DataType::BFLOAT16)
+ else if (data_type == DataType::BFLOAT16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
}
@@ -198,31 +210,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors)
prepare(tensors);
_gemm_asm_func->run(tensors);
- if(_run_activation)
+ if (_run_activation)
{
ITensor *io = tensors.get_tensor(ACL_DST);
- ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } };
+ ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}};
_activation_func->run(pack);
}
}
void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// If we are using fixed-format kernel the weights are already reshaped
- if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
+ if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
{
_gemm_asm_func->prepare(tensors);
_is_prepared = true;
return;
}
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
_weights_permute_func->run(permute_tensors);
tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
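The configure hunks above also decide how long the permuted weights must live. A hedged sketch of that choice with simplified types (in the library, when neither case applies the aux-memory slot is simply left unset; the Temporary fallback here exists only to make the sketch total):

#include <cstddef>

enum class MemoryLifetime { Temporary, Prepare, Persistent };

MemoryLifetime permuted_weights_lifetime(std::size_t pretranspose_size,
                                         bool        weight_format_unspecified)
{
    if (pretranspose_size > 0)
    {
        // The assembly dispatch re-transposes the weights during prepare(), so the permuted
        // copy is only needed while preparing and can be released afterwards.
        return MemoryLifetime::Prepare;
    }
    if (weight_format_unspecified)
    {
        // No pretranspose: the permuted weights are the ones consumed at run time and must
        // persist across runs.
        return MemoryLifetime::Persistent;
    }
    return MemoryLifetime::Temporary; // Fallback for the sketch only.
}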
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
index e55a461f36..1cc3caadae 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/operators/CpuActivation.h"
@@ -69,18 +70,26 @@ public:
* Data types supported: Same as @p input.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
*
* Similar to CpuGemmDirectConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
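The header changes above are again purely cosmetic: declarations that exceed the column limit are reflowed to one parameter per line with aligned names and defaults. A tiny illustration of the same rule on a hypothetical function (not part of the library):

// Reflowed the way clang-format 14 lays out the declarations above.
void configure_example(const float *src,
                       const float *weights,
                       const float *biases,
                       float       *dst,
                       bool         enable_fast_math = false,
                       unsigned int num_groups       = 1);

// Single-line form used before this commit:
//   void configure_example(const float *src, const float *weights, const float *biases,
//       float *dst, bool enable_fast_math = false, unsigned int num_groups = 1);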
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8ca128fb07..2ee879b67b 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -28,14 +28,14 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
@@ -59,12 +59,12 @@ namespace
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
- asm_info.fast_mode = info.fast_math();
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
@@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
}
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
-void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+void CpuGemmLowpMatrixMultiplyCore::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
@@ -122,28 +123,31 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_reshape_b_only_on_first_run = b->are_values_constant();
_is_prepared = false;
_fused_assembly_path = false;
- _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
- _gemm_info = gemm_info;
+ _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
+ _reshape_b_only_on_first_run;
+ _gemm_info = gemm_info;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
const ITensorInfo *a_to_use = a;
// Convert to QASYMM8 -> QASYMM8_SIGNED and back
- if(_flip_signedness)
+ if (_flip_signedness)
{
const int32_t offset_correction = 128;
const DataType dt = DataType::QASYMM8_SIGNED;
const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
- _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
_convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
_convert_to_signed_asymm->configure(a_to_use, &_signed_a);
a_to_use = &_signed_a;
_a_offset = _signed_a.quantization_info().uniform().offset;
const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
- _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+ _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
// Output stage correction
GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
@@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
// Initialize assembly kernel meta-data
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
#ifdef __aarch64__
- if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
{
- switch(a->data_type())
+ switch (a->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
auto c_info_to_use = c == nullptr ? nullptr : c;
_asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
@@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
}
#endif /* __aarch64__ */
- if(!(_assembly_path || _run_vector_matrix_multiplication))
+ if (!(_assembly_path || _run_vector_matrix_multiplication))
{
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+ _tmp_a =
+ TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
_tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
@@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_mtx_b_reshape_kernel->configure(b, &_tmp_b);
}
- if(!_fused_assembly_path)
+ if (!_fused_assembly_path)
{
// Build reduction info
const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ if (_a_offset != 0)
{
_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
@@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
_vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
@@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
// Configure matrix multiply kernel
- if(!_assembly_path)
+ if (!_assembly_path)
{
_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
_mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
}
- _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
- _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : dst,
- a->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel =
+ std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(
+ &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst,
+ a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
- if(_flip_signedness)
+ if (_flip_signedness)
{
_convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
_convert_from_signed_asymm->configure(&_signed_output, dst);
@@ -267,27 +273,29 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
else
{
// Configure matrix multiply kernel
- if(!_assembly_path)
+ if (!_assembly_path)
{
_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
_mm_kernel->configure(matrix_a, matrix_b, dst);
}
// Configure offset contribution kernel
_offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
- _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
+ _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
_a_offset, _b_offset);
}
}
// Configure activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_run_activation)
+ _run_activation =
+ activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+ if (_run_activation)
{
_activation_func = std::make_unique<CpuActivation>();
_activation_func->configure(dst, nullptr, activation);
}
- if(_assembly_path)
+ if (_assembly_path)
{
auto asm_mem_req = _asm_glue->workspace();
_aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
@@ -295,27 +303,41 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// Request memory for LHS and RHS reshape matrix
- _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
- && _reshape_b_only_on_first_run ?
- MemoryLifetime::Persistent :
- MemoryLifetime::Temporary,
- _vector_sum_col.total_size());
- _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
- _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
- _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
- _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
- _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+ _aux_mem[VectorSumCol] =
+ MemoryInfo(offset_int_vec(VectorSumCol),
+ !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
+ : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ _aux_mem[VectorSumRow] =
+ MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _tmp_b.total_size());
+ _aux_mem[MMResultS32] =
+ MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+ _aux_mem[SignedOutput] =
+ MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
}
-Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr &&
+ gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+ "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
@@ -333,28 +355,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
int32_t b_offset = b->quantization_info().uniform().offset;
bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
+ if (fuse_output_stage)
{
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ auto_init_if_empty(mm_result_s32_info,
+ a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
// Convert QASYMM8->QASYMM8_SIGNED
TensorInfo signed_a{};
TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
+ (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if (flip_signedness)
{
const int32_t offset_correction = 128;
const DataType dt = DataType::QASYMM8_SIGNED;
const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
a_to_use = &signed_a;
a_offset = signed_a.quantization_info().uniform().offset;
const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
// Output stage correction
GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -374,25 +400,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
{
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(
+ a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
}
}
- if(run_optimised)
+ if (run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
+ if (info.depth_output_gemm3d() != 0)
{
- if(info.reinterpret_input_as_3d())
+ if (info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -409,11 +438,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+ "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+ "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
+ if (!run_vector_matrix_multiplication)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
@@ -437,7 +468,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
}
}
- if(!run_optimised_requantized)
+ if (!run_optimised_requantized)
{
TensorInfo info_vector_sum_col{};
TensorInfo info_vector_sum_row{};
@@ -445,62 +476,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
+ if (a_offset != 0)
{
info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
// Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
}
// Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
+ if (b_offset != 0)
{
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
}
- if(fuse_output_stage)
+ if (fuse_output_stage)
{
- if(!run_optimised)
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
+ b_offset, info.gemmlowp_output_stage()));
}
else
{
- if(!run_optimised)
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
}
}
// Validate activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
+ if (activation.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
}
@@ -529,24 +568,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
// Convert QASYMM8->QASYMM8_SIGNED
- if(_flip_signedness)
+ if (_flip_signedness)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a },
- { TensorType::ACL_DST, signed_a.get() }
- };
- NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
+ NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
+ pack);
a_to_use = signed_a.get();
matrix_a = signed_a.get();
}
// Run GEMM
- if(_asm_glue->is_configured())
+ if (_asm_glue->is_configured())
{
ITensorPack asm_glue_tensors = tensors;
auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
- if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
+ _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
@@ -563,35 +600,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
}
else
{
- if(!_run_vector_matrix_multiplication)
+ if (!_run_vector_matrix_multiplication)
{
matrix_a = tmp_a.get();
matrix_b = tmp_b.get();
// Run interleave kernel
- ITensorPack pack_a =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, tmp_a.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
+ ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
+ pack_a);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
- ITensorPack pack_b =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
+ ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
// Run transpose kernel
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
+ _mtx_b_reshape_kernel->window(), pack_b);
}
}
- ITensorPack pack_mm =
- {
- { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b }
- };
- if(_fuse_output_stage)
+ ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
+ if (_fuse_output_stage)
{
pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
}
@@ -602,31 +629,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
}
- if(!_fused_assembly_path)
+ if (!_fused_assembly_path)
{
// Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, vector_sum_row.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
+ _mtx_a_reduction_kernel->window(), pack);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
@@ -636,7 +657,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
// Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
+ NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
+ _offset_contribution_output_stage_kernel->window(), pack);
}
else
{
@@ -646,68 +668,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
pack.add_tensor(TensorType::ACL_DST, dst);
// Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
+ NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
+ _offset_contribution_kernel->window(), pack);
}
}
// Convert QASYMM8_SIGNED->QASYMM8
- if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+ if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, signed_output.get() },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
+ NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
+ _convert_from_signed_asymm->window(), pack);
}
// Run fused activation unless already run in the fused assembly
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, dst },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
_activation_func->run(pack);
}
}
void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
// Run assembly reshape
- if(_asm_glue->is_configured())
+ if (_asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
}
// Run non-assembly reshape
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
{
// Run reshape kernel and mark original weights tensor as unused
- ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+ ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
+ pack);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
{
- ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+ ITensor *vector_sum_col_p =
+ utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
}
_is_prepared = true;
}
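The reduction kernels wired up above exist because of the zero-point algebra behind quantized GEMM: expanding sum_k (a_ik - a_off)(b_kj - b_off) leaves the raw integer product plus a correction that needs the column sums of B (scaled by a_off) and one that needs the row sums of A (scaled by b_off), which is exactly why each reduction is skipped when the corresponding offset is zero and why the column sums of a constant B can be kept in persistent memory. A minimal standalone check of that identity (plain C++ with made-up numbers, not Compute Library code):

    // Why the matrix A / matrix B reduction kernels are needed:
    // sum_k (a_k - a_off)(b_k - b_off)
    //   = sum_k a_k*b_k - a_off*sum_k b_k - b_off*sum_k a_k + K*a_off*b_off
    #include <cassert>
    #include <cstdint>

    int main()
    {
        const int     K     = 3;
        const int32_t a_off = 3, b_off = -2;
        const int32_t a[K]  = {10, 20, 30}; // one row of A
        const int32_t b[K]  = {1, 2, 3};    // one column of B

        int32_t exact = 0, raw = 0, row_sum = 0, col_sum = 0;
        for (int k = 0; k < K; ++k)
        {
            exact   += (a[k] - a_off) * (b[k] - b_off);
            raw     += a[k] * b[k];
            row_sum += a[k]; // what the matrix A reduction accumulates per row
            col_sum += b[k]; // what the matrix B reduction accumulates per column
        }
        assert(exact == raw - a_off * col_sum - b_off * row_sum + K * a_off * b_off);
        return 0;
    }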
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
index a1b34291d0..a7798938e7 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/GEMMInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -108,18 +109,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
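For orientation, the signatures reflowed in this header belong to the info-level operator API: the operator is configured once on ITensorInfo descriptors and then executed on an ITensorPack that binds the real tensors. A hypothetical usage sketch follows; it is illustrative only (internal src/ headers are assumed to be on the include path, and the auxiliary workspace tensors reported by workspace() are deliberately omitted), not code from this patch.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h"

    #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    void run_gemmlowp(const ITensorInfo *a, const ITensorInfo *b, ITensorInfo *dst,
                      ITensor *a_tensor, ITensor *b_tensor, ITensor *dst_tensor)
    {
        cpu::CpuGemmLowpMatrixMultiplyCore gemm;
        const GEMMInfo                     gemm_info{};

        // Static checks on the descriptors only; nothing is allocated here.
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b, nullptr, dst, gemm_info));

        // Kernel selection; auxiliary memory requirements become queryable via gemm.workspace().
        gemm.configure(a, b, nullptr, dst, gemm_info);

        // Bind the actual tensors at execution time.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, a_tensor);
        pack.add_const_tensor(TensorType::ACL_SRC_1, b_tensor);
        pack.add_tensor(TensorType::ACL_DST, dst_tensor);

        // Simplification: a real caller must also allocate the auxiliary tensors listed by
        // gemm.workspace() and add them to the pack before calling prepare()/run().
        gemm.prepare(pack);
        gemm.run(pack);
    }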
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
index 58f98acff0..4215eed199 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
@@ -36,36 +37,42 @@ namespace arm_compute
{
namespace cpu
{
-void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+void CpuGemmLowpOutputStage::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
- switch(info.output_data_type)
+ switch (info.output_data_type)
{
case DataType::QASYMM8:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
case DataType::QASYMM8_SIGNED:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
case DataType::QSYMM16:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound,
+ info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
@@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen
}
case GEMMLowpOutputStageType::QUANTIZE_DOWN:
{
- switch(info.output_data_type)
+ switch (info.output_data_type)
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen
}
}
-Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN,
+ "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) &&
+ (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
- switch(dst->data_type())
+ switch (dst->data_type())
{
case DataType::QASYMM8:
- return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
case DataType::QASYMM8_SIGNED:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
case DataType::QSYMM16:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
default:
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
}
}
case GEMMLowpOutputStageType::QUANTIZE_DOWN:
{
- switch(dst->data_type())
+ switch (dst->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
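The QUANTIZE_DOWN_FIXEDPOINT branch above hands gemmlowp_multiplier, gemmlowp_shift, gemmlowp_offset and the min/max bounds to the per-type kernels. A simplified scalar model of what those parameters do to a single int32 accumulator is sketched below; this is standalone illustrative C++ with invented numbers, not the NEON kernel (the real kernels use saturating rounding-doubling vector arithmetic):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Roughly: out = clamp(round(acc * M0 * 2^-31 * 2^-shift) + offset, min, max).
    int32_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int32_t shift,
                                     int32_t offset, int32_t min_bound, int32_t max_bound)
    {
        // Fixed-point multiply: round(acc * M0 / 2^31), with M0 a Q0.31 multiplier.
        const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        int64_t       res  = (prod + (int64_t{1} << 30)) >> 31;

        // Rounding right shift (assumes shift >= 0, i.e. the usual down-scaling case).
        if (shift > 0)
        {
            res = (res + (int64_t{1} << (shift - 1))) >> shift;
        }

        return std::clamp<int64_t>(res + offset, min_bound, max_bound);
    }

    int main()
    {
        // Invented example values: multiplier/shift as produced by calculate_quantized_multiplier,
        // offset and bounds as they would appear in GEMMLowpOutputStageInfo for a QASYMM8 output.
        const int32_t out = quantize_down_fixedpoint(/*acc=*/23042, /*multiplier=*/1340958208,
                                                     /*shift=*/9, /*offset=*/128,
                                                     /*min=*/0, /*max=*/255);
        std::printf("requantized value: %d\n", out);
        return 0;
    }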
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h
index 39394f6b5f..e5e2f41fa9 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.h
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
#include "arm_compute/core/Types.h"
+
#include "src/cpu/ICpuOperator.h"
/** This file contains all available output stages for GEMMLowp.
@@ -76,7 +77,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 8811a7ea6b..89087129c3 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -23,14 +23,16 @@
*/
#include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/MatMulInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,8 +48,11 @@ namespace cpu
{
namespace
{
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
const auto data_type = src->data_type();
const QuantizationInfo oq_info = dst->quantization_info();
@@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
int32_t output_multiplier;
int32_t output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- int32_t type_min = 0;
- int32_t type_max = 0;
+ int32_t type_min = 0;
+ int32_t type_max = 0;
std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
} // namespace
CpuMatMul::CpuMatMul()
- : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
+ : _transpose_kernel_lhs(),
+ _transpose_kernel_rhs(),
+ _asm_glue(),
+ _lhs_transposed(),
+ _rhs_transposed(),
+ _original_lhs_shape(),
+ _original_rhs_shape(),
+ _original_dst_shape()
{
}
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+Status CpuMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
@@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
gemm_info.fast_mode = settings.fast_math();
// Validate and then permute a/b
- if(adj_lhs)
+ if (adj_lhs)
{
- auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
+ auto_init_if_empty(lhs_transposed,
+ lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
// Assign lhs_to_use pointer to use transposed TensorInfo
lhs_to_use = &lhs_transposed;
}
- if(adj_rhs)
+ if (adj_rhs)
{
- auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
+ auto_init_if_empty(rhs_transposed,
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
// Assign rhs_to_use pointer to use transposed TensorInfo
rhs_to_use = &rhs_transposed;
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)");
+ "The product AB is defined only if the number of columns in A is equal to the "
+ "number of rows in B (after transpose)");
// Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
- for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
+ for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
+ "Broadcasting in Batch dimension is unsupported by this operator.");
}
// Quantized-specific configuration
- if(is_data_type_quantized(lhs->data_type()))
+ if (is_data_type_quantized(lhs->data_type()))
{
- ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
+ gemm_info.activation_info, gemm_info.output_stage));
}
cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
@@ -138,7 +162,12 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
return Status{};
}
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CpuMatMul::configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
_original_rhs_shape = rhs_to_use.tensor_shape();
// Reshape lhs for use with assembly kernels.
- lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
- dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
+ lhs_to_use.set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
+ dst_to_use.set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));
// 2. Configuration for transpose of lhs/rhs
// ------------------------------------------------------
// Initialise transposed TensorInfo class for aux tensors (intermediary tensors)
- if(_adj_lhs)
+ if (_adj_lhs)
{
// Setup transpose LHS
_transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
_transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
}
- if(_adj_rhs)
+ if (_adj_rhs)
{
// Setup transpose RHS
_transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
@@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
// Quantized-specific configuration
- if(is_data_type_quantized(lhs->data_type()))
+ if (is_data_type_quantized(lhs->data_type()))
{
- get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage);
+ get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
+ _gemm_info.output_stage);
}
// Configure Asm Kernel
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
- _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul
+ _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
+ _gemm_info); // c is nullptr as bias not supported in MatMul
// Specify memory requirements for intermediate tensors
auto asm_mem_req = _asm_glue->workspace();
// Specify memory required by gemm kernel
int idx = 0;
- for(const auto &aux : asm_mem_req)
+ for (const auto &aux : asm_mem_req)
{
_aux_mem[idx] = aux;
idx++;
@@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors)
    // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimension is 4th for lhs and dst within asm)
// Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
- lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
- dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ lhs->info()->set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
+ _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ dst->info()->set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
+ _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
    // Initialise object to handle stored transposed tensors in auxiliary memory
@@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors)
ITensorPack asm_tensors(tensors);
// Run transpose lhs if necessary
- if(_adj_lhs)
+ if (_adj_lhs)
{
- ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack);
+ ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
+ lhs_transpose_pack);
asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
}
// Run transpose rhs if necessary
- if(_adj_rhs)
+ if (_adj_rhs)
{
- ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack);
+ ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
+ rhs_transpose_pack);
asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
}
// Run asm kernel
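get_gemmlowp_output_stage_info() above relies on quantization::calculate_quantized_multiplier() to turn a floating-point rescale factor into the integer multiplier/shift pair consumed by the fixed-point output stage. The standalone sketch below shows the idea of that decomposition with an invented input value; the library's own implementation additionally handles left shifts, saturation and other edge cases:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Invented rescale factor, conceptually (src_scale * weights_scale) / dst_scale.
        const double multiplier = 0.0072474;

        // Split M into m0 * 2^exponent with m0 in [0.5, 1).
        int          exponent = 0;
        const double m0       = std::frexp(multiplier, &exponent);

        // Store m0 as a Q0.31 fixed-point value and fold the exponent into a right shift.
        int64_t quantized_multiplier = std::llround(m0 * static_cast<double>(int64_t{1} << 31));
        int     shift                = -exponent;

        // Rounding can push m0 up to exactly 1.0; renormalise so the multiplier stays in range.
        if (quantized_multiplier == (int64_t{1} << 31))
        {
            quantized_multiplier /= 2;
            --shift;
        }

        // Reconstruction check: M ~= quantized_multiplier * 2^-31 * 2^-shift.
        const double reconstructed =
            std::ldexp(static_cast<double>(quantized_multiplier) / (int64_t{1} << 31), -shift);
        std::printf("multiplier = %lld, shift = %d, reconstructed = %.9f (original %.9f)\n",
                    static_cast<long long>(quantized_multiplier), shift, reconstructed, multiplier);
        return 0;
    }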
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 475c019fd0..24db3da346 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_OPERATORS_CPUMATMUL
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuTransposeKernel.h"
@@ -66,18 +67,27 @@ public:
* @param[in] settings The settings for matmul operation (i.e fast math)
* @param[in] act_info Class containing information about fused activation function.
*/
- void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -91,9 +101,9 @@ private:
};
// Define unique pointers to kernels/operators used by matmul
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{ nullptr };
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr};
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
    // TensorInfo for tensors stored in auxiliary memory
TensorInfo _lhs_transposed{};
@@ -105,13 +115,13 @@ private:
TensorShape _original_dst_shape{};
// Note : adj_lhs means the same as transposing lhs
- bool _adj_lhs{ false };
- bool _adj_rhs{ false };
- bool _fast_math{ false };
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
+ bool _fast_math{false};
AsmGemmInfo _gemm_info{};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
-}
-}
+} // namespace cpu
+} // namespace arm_compute
#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */
diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp
index 24e9fd6d46..697fc40ab3 100644
--- a/src/cpu/operators/CpuMaxUnpooling.cpp
+++ b/src/cpu/operators/CpuMaxUnpooling.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuMaxUnpooling.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
@@ -29,7 +30,10 @@ namespace arm_compute
{
namespace cpu
{
-void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpooling::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info);
auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>();
@@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic
_kernel = std::move(k);
}
-Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpooling::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h
index aa1f1072a5..5dc00bce9e 100644
--- a/src/cpu/operators/CpuMaxUnpooling.h
+++ b/src/cpu/operators/CpuMaxUnpooling.h
@@ -44,14 +44,18 @@ public:
* @param[out] dst Destination tensor. Data types supported: Same as @p src
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuMaxUnpooling::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp
index 4c15015206..ac9847111d 100644
--- a/src/cpu/operators/CpuMul.cpp
+++ b/src/cpu/operators/CpuMul.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMulKernel.h"
@@ -33,14 +34,24 @@ namespace arm_compute
{
namespace cpu
{
-Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status CpuMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
}
-void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void CpuMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
@@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
-Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status CpuComplexMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
}
-void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void CpuComplexMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
@@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h
index 3e0edbf050..82b309830b 100644
--- a/src/cpu/operators/CpuMul.h
+++ b/src/cpu/operators/CpuMul.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -61,7 +62,12 @@ public:
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -69,7 +75,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -89,14 +100,20 @@ public:
* @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuComplexMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp
index babaf21b6f..25acc92d00 100644
--- a/src/cpu/operators/CpuPermute.cpp
+++ b/src/cpu/operators/CpuPermute.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuPermute.h"
-#include "src/cpu/kernels/CpuPermuteKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPermuteKernel.h"
namespace arm_compute
{
@@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons
{
return kernels::CpuPermuteKernel::validate(src, dst, perm);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee5..b72bde6978 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuPool2dKernel.h"
#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
@@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices);
// Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
// Get data layout
_data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
// Check if we have Global Pooling Layer
const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
- _use_kernel_indices = pool_info.use_kernel_indices;
+ _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) &&
+ (src->dimension(idx_height) == pool_info.pool_size.height);
+ _use_kernel_indices = pool_info.use_kernel_indices;
- if(run_optimised)
+ if (run_optimised)
{
const CPUInfo &ci = NEScheduler::get().cpu_info();
const unsigned int num_threads = NEScheduler::get().num_threads();
@@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
// Get kernel's memory requirements
constexpr size_t alignment = 4096;
const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
- _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
+ _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
_asm_glue = std::move(pooling_wrapper);
}
@@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
}
}
-Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2d::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
- if(run_optimised)
+ if (run_optimised)
{
return Status{};
}
@@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
- if(_asm_glue)
+ if (_asm_glue)
{
const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
}
else
{
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ _is_global_pooling_layer ? Window::DimZ : Window::DimY,
+ _pooling_layer_kernel->window(), tensors);
break;
case DataLayout::NHWC:
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors);
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ (_use_kernel_indices ? Window::DimY : Window::DimX),
+ _pooling_layer_kernel->window(), tensors);
break;
default:
ARM_COMPUTE_ERROR("Data layout not supported");
diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h
index 5c571db88a..ea73e3f335 100644
--- a/src/cpu/operators/CpuPool2d.h
+++ b/src/cpu/operators/CpuPool2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL2D_H
#include "arm_compute/core/experimental/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -58,17 +59,21 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuPool2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
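
A minimal sketch of the CpuPool2d::validate() overload reflowed above, exercised as a global max pool (pool size equal to the input's spatial extent, mirroring the _is_global_pooling_layer check in the .cpp hunk). Field names not visible in this hunk (pool_type) are assumed; an in-tree build is assumed.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuPool2d.h"

using namespace arm_compute;

bool global_max_pool_supported()
{
    // NHWC shapes in ACL are ordered [C, W, H, N]
    TensorInfo src(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);
    TensorInfo dst(TensorShape(16U, 1U, 1U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    PoolingLayerInfo pool_info{};
    pool_info.pool_type   = PoolingType::MAX; // assumed public field name
    pool_info.pool_size   = Size2D(8, 8);     // equals the spatial dims -> global pooling path above
    pool_info.data_layout = DataLayout::NHWC;

    return bool(cpu::CpuPool2d::validate(&src, &dst, pool_info)); // indices left at nullptr
}
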
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
index 14e4ac6c97..7fa78c1f80 100644
--- a/src/cpu/operators/CpuPool3d.cpp
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Scheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuPool3dKernel.h"
@@ -35,8 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuPool3d::CpuPool3d()
- : _aux_mem(1)
+CpuPool3d::CpuPool3d() : _aux_mem(1)
{
}
@@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
index 8a73f8a0af..235d798095 100644
--- a/src/cpu/operators/CpuPool3d.h
+++ b/src/cpu/operators/CpuPool3d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL3D_H
#include "arm_compute/core/experimental/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -61,7 +62,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
index f9e14d1f88..4315499c39 100644
--- a/src/cpu/operators/CpuQuantize.cpp
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuQuantizeKernel.h"
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index e6892a2e7e..a423abb49a 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -23,11 +23,10 @@
*/
#include "src/cpu/operators/CpuReshape.h"
-#include "src/cpu/kernels/CpuReshapeKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
namespace arm_compute
{
@@ -49,7 +48,7 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
void CpuReshape::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- if(!_is_prepared)
+ if (!_is_prepared)
{
static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
_is_prepared = true;
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 9bc43e7db4..33da792319 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CPU_RESHAPE_H
#define ARM_COMPUTE_CPU_RESHAPE_H
-#include "src/cpu/ICpuOperator.h"
#include "arm_compute/core/Window.h"
+#include "src/cpu/ICpuOperator.h"
+
namespace arm_compute
{
namespace cpu
@@ -53,7 +54,7 @@ public:
void run(ITensorPack &tensors) override;
private:
- bool _is_prepared{ false } ;
+ bool _is_prepared{false};
};
} // namespace cpu
} // namespace arm_compute
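
A minimal sketch of CpuReshape::validate(): source and destination only need to agree on the total number of elements. An in-tree build is assumed for the internal header.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuReshape.h"

using namespace arm_compute;

bool reshape_supported()
{
    const TensorInfo src(TensorShape(8U, 4U), 1, DataType::F32); // 32 elements
    const TensorInfo dst(TensorShape(32U), 1, DataType::F32);    // same 32 elements, flattened
    return bool(cpu::CpuReshape::validate(&src, &dst));
}
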
diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp
index 8a712bf088..7df9296931 100644
--- a/src/cpu/operators/CpuScale.cpp
+++ b/src/cpu/operators/CpuScale.cpp
@@ -24,8 +24,9 @@
#include "src/cpu/operators/CpuScale.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/utils/ScaleUtils.h"
#include "src/cpu/kernels/CpuScaleKernel.h"
@@ -37,11 +38,12 @@ namespace cpu
{
namespace
{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
+void precompute_dx_dy_offsets(
+ ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
{
ARM_COMPUTE_ERROR_ON(offsets == nullptr);
float sampling_offset = 0.0f;
- if(sampling_policy == SamplingPolicy::CENTER)
+ if (sampling_policy == SamplingPolicy::CENTER)
{
sampling_offset = 0.5f;
}
@@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
- if(dx != nullptr && dy != nullptr)
+ if (dx != nullptr && dy != nullptr)
{
// Pre-compute the offset and pixel's distance for BILINEAR interpolation
Iterator offsets_it(offsets, win);
Iterator dx_it(dx, win);
Iterator dy_it(dy, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
- const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
- const int in_xi = std::floor(in_x);
- const int in_yi = std::floor(in_y);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
- *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
- },
- offsets_it, dx_it, dy_it);
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
}
else
{
// Pre-compute the offset for NEAREST interpolation
Iterator offsets_it(offsets, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float float_in_xi = (id.x() + sampling_offset) * wr;
- const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- },
- offsets_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float float_in_xi = (id.x() + sampling_offset) * wr;
+ const auto in_xi = static_cast<size_t>(
+ align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi)
+ : std::floor(float_in_xi));
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ },
+ offsets_it);
}
}
} // namespace
@@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
_is_prepared = false;
// Get data layout and width/height indices
- _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
// Get the tensor shape
TensorShape shape(dst->dimension(idx_width));
@@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
ITensorInfo *offsets = nullptr;
ITensorInfo *dx = nullptr;
@@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
// Get the tensor shape of auxilary buffers
const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
TensorInfo tensor_info_offsets(shape, Format::S32);
TensorInfo tensor_info_dx(shape, Format::F32);
TensorInfo tensor_info_dy(shape, Format::F32);
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
offsets = &tensor_info_offsets;
@@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const
break;
}
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
return Status{};
}
void CpuScale::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_is_prepared = true;
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
@@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors)
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
- bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
- if(precompute_indices_weights)
+ if (precompute_indices_weights)
{
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors)
}
else
{
- if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
{
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
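
The policy_to_use selection above demotes AREA to NEAREST_NEIGHBOR whenever both resize ratios are <= 1.f, i.e. when up-sampling. A small self-contained sketch of that ratio logic; the formula is an assumption mirroring scale_utils::calculate_resize_ratio, which lives in src/core/utils/ScaleUtils.h and is not shown in this patch excerpt.

#include <cstddef>

// Assumed formula: with align_corners the end points of both grids map onto each
// other, otherwise the ratio is a plain size quotient (src / dst).
float resize_ratio(std::size_t in, std::size_t out, bool align_corners)
{
    return align_corners ? static_cast<float>(in - 1) / static_cast<float>(out - 1)
                         : static_cast<float>(in) / static_cast<float>(out);
}

// Example: scaling 8 -> 16 gives wr = hr = 0.5f (< 1.f), so a requested AREA policy is
// treated as NEAREST_NEIGHBOR by the selection above; 16 -> 8 gives 2.0f and AREA is kept.
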
diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h
index ee7c523bad..c12a8e733a 100644
--- a/src/cpu/operators/CpuScale.h
+++ b/src/cpu/operators/CpuScale.h
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CPU_SCALE_H
#define ARM_COMPUTE_CPU_SCALE_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/experimental/Types.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -62,9 +63,9 @@ public:
void run(ITensorPack &tensors) override;
private:
- ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- bool _is_prepared{ false };
+ ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ bool _is_prepared{false};
};
} // namespace cpu
} // namespace arm_compute
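
A minimal sketch of CpuScale::validate() using the same two-argument ScaleKernelInfo brace-initialisation as the member default above: bilinear F32 up-scaling with the default CENTER sampling (validate() accepts only CENTER and TOP_LEFT). An in-tree build is assumed.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuScale.h"

using namespace arm_compute;

bool bilinear_upscale_supported()
{
    const TensorInfo src(TensorShape(8U, 8U, 3U), 1, DataType::F32); // NCHW: [W, H, C]
    const TensorInfo dst(TensorShape(16U, 16U, 3U), 1, DataType::F32);
    const ScaleKernelInfo info{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE};
    return bool(cpu::CpuScale::validate(&src, &dst, info));
}
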
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index bf4c2fa3a2..e55d7f903e 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -25,9 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
@@ -63,13 +64,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
_needs_permute = actual_axis > 0;
- if(_needs_permute)
+ if (_needs_permute)
{
- _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ _permute_input.configure(src, &_input_permuted,
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
}
// We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
@@ -79,10 +82,11 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
// Create intermediate tensors shapes
TensorShape max_sum_shape = tmp_input->tensor_shape();
max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type =
+ is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
// Init intermediate tensors
_max = TensorInfo(max_info);
@@ -94,13 +98,14 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
_max_kernel = std::move(mk);
auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
- if(_needs_permute)
+ if (_needs_permute)
{
// The normalization kernel stores the result in a permuted output tensor
sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
// Re-permute the permuted output into the requested (4D) output
- _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ _permute_output.configure(&_output_permuted, dst,
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
}
else
{
@@ -109,11 +114,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
}
_softmax_kernel = std::move(sm);
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ _aux_mem[InternalTensorIdx::MAX] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
+ MemoryLifetime::Temporary, _input_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
+ MemoryLifetime::Temporary, _output_permuted.total_size());
}
template <bool IS_LOG>
@@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= axis);
// Create intermediate tensor info
DataType tmp_data_type = src->data_type();
@@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
TensorShape max_sum_shape = src->tensor_shape();
max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
+ const TensorInfo tensor_info_max_sum(src->clone()
+ ->set_tensor_shape(max_sum_shape)
+ .set_data_type(tmp_data_type)
+ .set_quantization_info(src->quantization_info())
+ .set_is_resizable(true));
const TensorInfo dont_care;
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
const bool needs_permute = actual_axis > 0;
- if(needs_permute)
+ if (needs_permute)
{
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
- TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
+ const PermutationVector permutation_vector =
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape =
+ misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
+ TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
}
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+ &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
return Status{};
}
@@ -166,43 +184,38 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
- CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true);
+ CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors,
+ true);
ITensorPack max_pack;
ITensorPack softmax_pack;
- if(_needs_permute)
+ if (_needs_permute)
{
- ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
+ ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}};
_permute_input.run(permute_in_pack);
- max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
+ max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}};
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, input_permuted.get() },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, output_permuted.get() },
- { TensorType::ACL_DST_1, tmp.get() }
- };
+ softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()},
+ {TensorType::ACL_SRC_1, max.get()},
+ {TensorType::ACL_DST_0, output_permuted.get()},
+ {TensorType::ACL_DST_1, tmp.get()}};
}
else
{
- max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, src },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, dst },
- { TensorType::ACL_DST_1, tmp.get() }
- };
+ max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}};
+
+ softmax_pack = {{TensorType::ACL_SRC_0, src},
+ {TensorType::ACL_SRC_1, max.get()},
+ {TensorType::ACL_DST_0, dst},
+ {TensorType::ACL_DST_1, tmp.get()}};
}
NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
- if(_needs_permute)
+ if (_needs_permute)
{
ITensorPack permute_out_pack;
permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
@@ -211,7 +224,7 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
}
}
-template <bool IS_LOG>
+template <bool IS_LOG>
experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
{
return _aux_mem;
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 64df8704f9..8cab70e14f 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -24,11 +24,13 @@
#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
#define ARM_COMPUTE_CPU_SOFTMAX_H
-#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/TensorInfo.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/operators/CpuPermute.h"
+
#include <memory>
namespace arm_compute
@@ -77,7 +79,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
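
A minimal sketch of the validate() declaration above, using a negative axis: for a 4D tensor, axis = -1 wraps to 3 (the wrap_around() call in the .cpp hunk), which makes actual_axis > 0 and therefore selects the permute path. Calling the template directly as CpuSoftmaxGeneric<false> assumes the usual explicit instantiation of the non-log variant; an in-tree build is assumed.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuSoftmax.h"

using namespace arm_compute;

bool softmax_last_axis_supported()
{
    const TensorInfo src(TensorShape(16U, 4U, 4U, 2U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 4U, 4U, 2U), 1, DataType::F32);
    // beta = 1.0f, axis = -1 -> wrapped to 3, so the operator permutes internally.
    return bool(cpu::CpuSoftmaxGeneric<false>::validate(&src, &dst, 1.0f, -1));
}
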
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
index 91a5b6e63c..7d27efbc96 100644
--- a/src/cpu/operators/CpuSub.cpp
+++ b/src/cpu/operators/CpuSub.cpp
@@ -23,17 +23,20 @@
*/
#include "src/cpu/operators/CpuSub.h"
-#include "src/cpu/kernels/CpuSubKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuSubKernel.h"
namespace arm_compute
{
namespace cpu
{
-void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuSub::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy);
@@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor
_kernel = std::move(k);
}
-Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuSub::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
index 88908637aa..d1782a1d3c 100644
--- a/src/cpu/operators/CpuSub.h
+++ b/src/cpu/operators/CpuSub.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_SUB_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -53,14 +54,22 @@ public:
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuSub::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
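
A minimal sketch of the reflowed CpuSub::validate() declaration above. The activation argument is left at its default since validate() rejects an enabled activation (see the .cpp hunk), and SATURATE side-steps the quantized-WRAP restriction noted in the parameter docs. An in-tree build is assumed.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuSub.h"

using namespace arm_compute;

bool sub_f32_supported()
{
    const TensorInfo src0(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo src1(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 8U), 1, DataType::F32);
    return bool(cpu::CpuSub::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE));
}
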
diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp
index 4e7854fd6e..ea548e0511 100644
--- a/src/cpu/operators/CpuTranspose.cpp
+++ b/src/cpu/operators/CpuTranspose.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuTranspose.h"
-#include "src/cpu/kernels/CpuTransposeKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
namespace arm_compute
{
@@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
return kernels::CpuTransposeKernel::validate(src, dst);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
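
A minimal sketch of CpuTranspose::validate(), whose namespace comment is fixed above: the destination carries the first two dimensions of the source swapped. An in-tree build is assumed.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/cpu/operators/CpuTranspose.h"

using namespace arm_compute;

bool transpose_supported()
{
    const TensorInfo src(TensorShape(8U, 4U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(4U, 8U), 1, DataType::F32); // W and H swapped
    return bool(cpu::CpuTranspose::validate(&src, &dst));
}
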
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
index c4edd89964..9d07736c13 100644
--- a/src/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -22,23 +22,25 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuWinogradConv2d.h"
+
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuPermute.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
@@ -56,21 +58,26 @@ namespace
inline Tensor4DShape internal_get_shape(const ITensorInfo *in)
{
const DataLayout data_layout = in->data_layout();
- const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
- const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
- const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
+ const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
- return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
+ return Tensor4DShape{in_batches, in_height, in_width, in_channels};
}
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(dst, weights);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
- if(biases != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+ "Winograd layer only supports unit strides.");
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
return Status{};
}
-bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math,
- arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
+bool get_winograd_kernel_implementation(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ arm_conv::winograd::WinogradImpl *winograd_impl,
+ std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
{
arm_conv::winograd::WinogradConfig winograd_cfg;
arm_gemm::GemmConfig cfg;
const DataType data_type = src->data_type();
- Tensor4DShape in_shape{ internal_get_shape(src) };
- Tensor4DShape out_shape{ internal_get_shape(dst) };
- Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+ Tensor4DShape in_shape{internal_get_shape(src)};
+ Tensor4DShape out_shape{internal_get_shape(dst)};
+ Tensor4DShape kernel_shape{internal_get_shape(weights)};
uint32_t nthreads = NEScheduler::get().num_threads();
// Get configuration arguments for Winograd
winograd_cfg.output_rows = 0;
winograd_cfg.output_cols = 0;
conv_args = std::make_unique<arm_conv::ConvolutionArgs>(
- in_shape.n_batches,
- arm_conv::Shape2D{ static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols) },
- in_shape.n_channels,
- conv_info.pad_top(),
- conv_info.pad_left(),
- arm_conv::Shape2D{ static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols) },
- out_shape.n_channels,
- arm_conv::Shape2D{ static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols) },
- assembly_utils::map_to_arm_gemm_activation(act_info));
+ in_shape.n_batches,
+ arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)},
+ in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
+ arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
+ out_shape.n_channels,
+ arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
+ assembly_utils::map_to_arm_gemm_activation(act_info));
bool success = false;
- if(data_type == DataType::F32)
+ if (data_type == DataType::F32)
{
- success = arm_conv::winograd::get_implementation<float>(
- *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+ success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- else if(data_type == DataType::F16)
+ else if (data_type == DataType::F16)
{
- success = arm_conv::winograd::get_implementation<__fp16>(
- *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+ success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
else
@@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf
}
inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
{
- return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+ return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
}
} // namespace
@@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d()
_permute_output(std::make_unique<CpuPermute>()),
_permute_weights(std::make_unique<CpuPermute>()),
_aux_mem(AuxTensorIdx::Count),
- _conv_args{ nullptr },
+ _conv_args{nullptr},
_winograd_impl{},
_data_layout(),
_winograd_transformed_input{},
@@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d()
_weights_hwio(),
_input_nhwc(),
_output_nhwc(),
- _is_prepared{ false },
- _run_activation{ false }
+ _is_prepared{false},
+ _run_activation{false}
{
}
CpuWinogradConv2d::~CpuWinogradConv2d() = default;
-void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CpuWinogradConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
@@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
const DataType data_type = src->data_type();
uint32_t nthreads = NEScheduler::get().num_threads();
_data_layout = src->data_layout();
- const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
-
- bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args);
-
- ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
-
- const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
- if(has_impl)
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
+
+ bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &_winograd_impl, _conv_args);
+
+ ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+
+ const bool has_impl = ((_winograd_impl.input_transform != nullptr) &&
+ (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
+ if (has_impl)
{
// Determine how much working space is required, allocate it.
- const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
- const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t input_workspace_size =
+ _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t output_workspace_size =
+ _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
@@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
// Configure the kernel to transform the input tensor from NCHW -> NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
_permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
@@ -242,28 +266,30 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
// Reorder the convoluted output to ACL's ordering NCHW
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
- dst->dimension(1), dst->dimension(3)),
- 1, dst->data_type());
+ TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1,
+ dst->data_type());
_output_nhwc = info;
_permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
}
// Configure input transform kernel
- _transform_input_kernel = std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
+ _transform_input_kernel =
+ std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
// Configure GEMM function
- _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f);
+ _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr,
+ &_winograd_transformed_output, 1.0f, 0.f);
// Configure output transform kernel
- _transform_output_kernel = std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
+ _transform_output_kernel =
+ std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
//Configure Activation Layer
_run_activation = act_info.enabled() && !fuse_function_supported(act_info);
- if(_run_activation)
+ if (_run_activation)
{
_activation_func->configure(dst, nullptr, act_info);
}
@@ -276,40 +302,55 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_aux_mem[TempResult] = asm_mem_req[TempResult];
// Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment);
- _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment);
- _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
- _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment);
- if(_data_layout == DataLayout::NCHW)
+ _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
+ wds.input_matrix_size_bytes, storage_alignment);
+ _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary,
+ wds.output_matrix_size_bytes, storage_alignment);
+ _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary,
+ std::max(input_workspace_size, output_workspace_size));
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
+ _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent,
+ wds.weight_matrix_size_bytes, storage_alignment);
+ if (_data_layout == DataLayout::NCHW)
{
_aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
_aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
}
}
}
-Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CpuWinogradConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
// Disable winograd for fp16 if fast math is false.
- if(!enable_fast_math)
+ if (!enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
}
- const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
arm_conv::winograd::WinogradImpl winograd_impl{};
std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
- const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str());
+ const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &winograd_impl, conv_args);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
return Status{};
}
@@ -328,24 +369,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors)
// Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory.
CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
- CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true);
+ CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input,
+ tensors, true);
CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
const bool is_nchw = _data_layout == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } };
+ ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
_permute_input->run(pack);
}
- CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true);
+ CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output,
+ tensors, true);
CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
- ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } };
+ ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src},
+ {ACL_DST, winograd_input_transformed.get()},
+ {ACL_INT, input_workspace.get()}};
NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack);
- CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true);
+ CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights,
+ tensors, true);
// Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
ITensorPack gemm_pack = tensors;
@@ -356,30 +402,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors)
_gemm_function->run(gemm_pack);
// Output transform
- ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } };
+ ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
+ {ACL_DST, is_nchw ? output_nhwc.get() : output},
+ {ACL_SRC_1, biases},
+ {ACL_INT, output_workspace.get()}};
NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);
- if(is_nchw)
+ if (is_nchw)
{
// Reorder the convoluted output to ACL's ordering NCHW
- ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } };
+ ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
_permute_output->run(pack);
}
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } };
+ ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
_activation_func->run(pack);
}
}
void CpuWinogradConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
_permute_weights->run(permute_tensors);
const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
        // Weights were in OHWI format; after the permute, "permuted_weights" holds them in HWIO format.
@@ -387,31 +437,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors)
const unsigned int width_idx = 2; // W in HWIO
const unsigned int channel_idx = 1; // I in HWIO
- const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
- const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
- const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
+ const int permuted_weight_row_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
+ const int permuted_weight_col_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
+ const int permuted_weight_channel_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
        // Wrap the winograd-domain transformed weight TensorInfo in an auxiliary tensor and allocate the required memory.
- ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+ ITensor *weights_transf =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
const void *permuted_weights_ptr;
void *win_wght_transf_ptr;
- permuted_weights_ptr = reinterpret_cast<const void *>(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
- win_wght_transf_ptr = reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
+ permuted_weights_ptr = reinterpret_cast<const void *>(
+ permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
+ win_wght_transf_ptr =
+ reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
+ winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
// Prepare Weights
_winograd_impl.weight_transform->execute(
- *_conv_args,
- permuted_weights_ptr,
- permuted_weight_row_stride,
- permuted_weight_col_stride,
- permuted_weight_channel_stride,
- win_wght_transf_ptr,
- _winograd_impl.winograd_spec,
- 0, 1 // Thread 1 of 1
+ *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride,
+ permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1
);
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
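
In prepare() above, the row, column and channel strides handed to the weight transform are obtained by dividing byte strides by the element size. The standalone sketch below redoes that arithmetic for a contiguous tensor; the shape, the F32 element size and the assumption that H sits at index 3 are all hypothetical.

#include <array>
#include <cstddef>
#include <cstdio>

int main()
{
    // Contiguous 4D tensor, innermost dimension first: O=16, I=8, W=3, H=3 (hypothetical HWIO weights).
    const std::array<long, 4> shape{16, 8, 3, 3};
    const long element_size_in_bytes = 4; // e.g. F32

    std::array<long, 4> strides_in_bytes{};
    long running = element_size_in_bytes;
    for (size_t i = 0; i < shape.size(); ++i)
    {
        strides_in_bytes[i] = running;
        running *= shape[i];
    }

    // Dividing by the element size recovers strides in elements, which is what the transform expects.
    const long channel_stride = strides_in_bytes[1] / element_size_in_bytes; // I
    const long col_stride     = strides_in_bytes[2] / element_size_in_bytes; // W
    const long row_stride     = strides_in_bytes[3] / element_size_in_bytes; // H (index assumed)
    std::printf("row=%ld col=%ld channel=%ld (in elements)\n", row_stride, col_stride, channel_stride);
    return 0;
}
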
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
index e0df34e2db..7e1d952462 100644
--- a/src/cpu/operators/CpuWinogradConv2d.h
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -26,10 +26,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuGemm.h"
#include "src/cpu/operators/CpuPermute.h"
@@ -73,7 +74,11 @@ public:
     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
     *                              available, which can reduce accuracy. Default is false
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(),
bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
@@ -82,13 +87,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(),
bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -108,27 +117,28 @@ private:
PermutedOutput = TransformedInput,
Count = 10
};
- std::unique_ptr<CpuGemm> _gemm_function;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<ICPPKernel> _transform_input_kernel;
- std::unique_ptr<ICPPKernel> _transform_output_kernel;
- std::unique_ptr<CpuPermute> _permute_input;
- std::unique_ptr<CpuPermute> _permute_output;
- std::unique_ptr<CpuPermute> _permute_weights;
- experimental::MemoryRequirements _aux_mem{ Count };
- std::unique_ptr<arm_conv::ConvolutionArgs> _conv_args; // Make it unique ptr because this type does not have a default constructor
- arm_conv::winograd::WinogradImpl _winograd_impl;
- DataLayout _data_layout;
- TensorInfo _winograd_transformed_input;
- TensorInfo _winograd_transformed_output;
- TensorInfo _winograd_transformed_weights;
- TensorInfo _input_workspace;
- TensorInfo _output_workspace;
- TensorInfo _weights_hwio;
- TensorInfo _input_nhwc;
- TensorInfo _output_nhwc;
- bool _is_prepared;
- bool _run_activation;
+ std::unique_ptr<CpuGemm> _gemm_function;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<ICPPKernel> _transform_input_kernel;
+ std::unique_ptr<ICPPKernel> _transform_output_kernel;
+ std::unique_ptr<CpuPermute> _permute_input;
+ std::unique_ptr<CpuPermute> _permute_output;
+ std::unique_ptr<CpuPermute> _permute_weights;
+ experimental::MemoryRequirements _aux_mem{Count};
+ std::unique_ptr<arm_conv::ConvolutionArgs>
+ _conv_args; // Make it unique ptr because this type does not have a default constructor
+ arm_conv::winograd::WinogradImpl _winograd_impl;
+ DataLayout _data_layout;
+ TensorInfo _winograd_transformed_input;
+ TensorInfo _winograd_transformed_output;
+ TensorInfo _winograd_transformed_weights;
+ TensorInfo _input_workspace;
+ TensorInfo _output_workspace;
+ TensorInfo _weights_hwio;
+ TensorInfo _input_nhwc;
+ TensorInfo _output_nhwc;
+ bool _is_prepared;
+ bool _run_activation;
};
} // namespace cpu
} // namespace arm_compute
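
In the enum above, PermutedOutput aliases the TransformedInput slot, so two temporaries can share one block of auxiliary memory, presumably because their live ranges inside run() do not overlap. A minimal standalone sketch of that slot-sharing idea follows; slot sizes are invented for illustration.

#include <algorithm>
#include <cstddef>
#include <cstdio>

enum AuxSlot : int
{
    TransformedInput  = 0,
    TransformedOutput = 1,
    PermutedOutput    = TransformedInput, // alias: reuse the same backing memory
    Count             = 2
};

int main()
{
    size_t slot_size[Count] = {};
    // Each logical user records the largest size it needs in its (possibly shared) slot.
    slot_size[TransformedInput] = std::max(slot_size[TransformedInput], static_cast<size_t>(64 * 1024));
    slot_size[PermutedOutput]   = std::max(slot_size[PermutedOutput], static_cast<size_t>(48 * 1024));
    std::printf("slot %d holds %zu bytes\n", static_cast<int>(TransformedInput), slot_size[TransformedInput]);
    return 0;
}
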
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 3069d6b541..343ef21c0b 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -24,12 +24,13 @@
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
#include <arm_neon.h>
@@ -53,7 +54,12 @@ namespace
* @param[in] num_threads Number of threads to run this method. Must be >= 1
*/
template <typename TypeInput, typename TypeOutput>
-void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm,
+ ITensor *dst,
+ const TypeInput *src,
+ int src_ld,
+ int src_multi_stride,
+ unsigned int num_threads)
{
ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
ARM_COMPUTE_ERROR_ON(num_threads == 0);
@@ -61,14 +67,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutpu
const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
std::vector<IScheduler::Workload> workloads(num_threads);
- for(unsigned int t = 0; t < num_threads; ++t)
+ for (unsigned int t = 0; t < num_threads; ++t)
{
- workloads[t] = [ = ](const ThreadInfo & info)
+ workloads[t] = [=](const ThreadInfo &info)
{
const unsigned int start = (info.thread_id * wsize) / num_threads;
const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads;
- if(start < end)
+ if (start < end)
{
gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
}
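
The lambda above gives thread t the half-open range [t*wsize/num_threads, (t+1)*wsize/num_threads); the guard skips threads whose share rounds down to nothing. The standalone check below (hypothetical sizes) shows that the chunks tile the window exactly once.

#include <cassert>
#include <cstdio>

int main()
{
    const unsigned int wsize       = 10; // pretranspose window size (hypothetical)
    const unsigned int num_threads = 4;
    unsigned int       covered     = 0;
    for (unsigned int t = 0; t < num_threads; ++t)
    {
        const unsigned int start = (t * wsize) / num_threads;
        const unsigned int end   = ((t + 1) * wsize) / num_threads;
        if (start < end) // some threads may get an empty range when wsize < num_threads
        {
            std::printf("thread %u: [%u, %u)\n", t, start, end);
            covered += end - start;
        }
    }
    assert(covered == wsize); // every unit of work is assigned exactly once, with no overlap
    return 0;
}
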
@@ -113,7 +119,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen
p.sections = 1;
p.indirect = false;
- if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+ if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
{
p.indirect = true;
p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
@@ -125,7 +131,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen
}
// Update M in case of GEMM3D for output
- if(info.depth_output_gemm3d != 0)
+ if (info.depth_output_gemm3d != 0)
{
p.M = d->tensor_shape().y() * d->tensor_shape().z();
p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
@@ -139,19 +145,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp
// Schedule assembly kernel
const int granule_threshold = 200;
IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+ if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
{
scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
}
- else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
+ else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D &&
+ (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 ||
+ data_type == DataType::S8))
{
        // GEMM_INTERLEAVED supports 2D parallelism; IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
- else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
+ else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D &&
+ (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
{
//special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
return scheduling_hint;
@@ -175,8 +186,12 @@ public:
* @param[in] gemm_info GEMM meta-data
* @param[in] os Output stage meta-data.
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
const OutputStage &os = {});
/** Set requantization shifts to be used
@@ -193,19 +208,20 @@ public:
*
* @return A tuple with the pointers to the shift and multiplier data respectively
*/
- std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
- const std::vector<int32_t> &multipliers);
+ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+ set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
bool is_configured() const override;
experimental::MemoryRequirements workspace() const override;
bool isVarWeightsKernel() const override
{
- if(!_gemm_kernel_asm)
+ if (!_gemm_kernel_asm)
return false;
- const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
+ const arm_compute::WeightFormat wf =
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY;
}
@@ -229,15 +245,15 @@ private:
void prepare_indirect_buffer(ITensorPack &tensors);
/** Assembly Gemm kernel */
- std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
/** Optimised Arm® Neon™ kernel */
- std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+ std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
/** Assembly GEMM workspace tensor info */
TensorInfo _workspace_info{};
/** Pre-transpose tensor info */
TensorInfo _pretranspose_info{};
/** Prepared flag */
- bool _is_prepared{ false };
+ bool _is_prepared{false};
/** GEMM meta-data */
AsmGemmInfo _gemm_info{};
/** GEMM kernel description */
@@ -251,26 +267,27 @@ private:
/** Indirect buffer */
std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
- std::vector<TypeInput> _indirect_pad{};
- arm_gemm::ConvolutionParameters _cp{};
- experimental::MemoryRequirements _aux_mem{ Count };
- bool _B_pretranspose_required{ false };
- bool _is_b_constant{ true };
- bool _is_c_constant{ true };
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
+ experimental::MemoryRequirements _aux_mem{Count};
+ bool _B_pretranspose_required{false};
+ bool _is_b_constant{true};
+ bool _is_c_constant{true};
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+ const std::vector<int32_t> &multipliers)
{
_multipliers = multipliers;
_shifts = shifts;
bool need_left = false;
- for(const auto s : _shifts)
+ for (const auto s : _shifts)
{
left_shifts.push_back(std::max(-s, int32_t(0)));
right_shifts.push_back(std::min(-s, int32_t(0)));
- if(s < 0 && !need_left)
+ if (s < 0 && !need_left)
{
need_left = true;
}
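
set_requantize_data() above derives two shift arrays from one signed shift per channel: left = max(-s, 0) and right = min(-s, 0), flagging whether any left shift is needed at all. A standalone version with hypothetical shift values:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int32_t> shifts{-3, 0, 2}; // per-channel shifts (hypothetical)
    std::vector<int32_t>       left_shifts, right_shifts;
    bool                       need_left = false;
    for (const auto s : shifts)
    {
        left_shifts.push_back(std::max(-s, int32_t(0)));  // positive part of -s
        right_shifts.push_back(std::min(-s, int32_t(0))); // negative part of -s
        if (s < 0)
        {
            need_left = true;
        }
    }
    std::printf("need_left=%d first_left=%d first_right=%d\n", static_cast<int>(need_left), left_shifts[0],
                right_shifts[0]);
    return 0;
}
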
@@ -295,32 +312,35 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
const int multi_size = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInput);
- for(int64_t m = 0; m < multis; m++)
+ for (int64_t m = 0; m < multis; m++)
{
- for(int64_t b = 0; b < batches; b++)
+ for (int64_t b = 0; b < batches; b++)
{
- for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+ for (int64_t output_y = 0; output_y < _cp.output_height; output_y++)
{
- for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+ for (int64_t output_x = 0; output_x < _cp.output_width; output_x++)
{
int64_t output_xy = (output_y * _cp.output_width) + output_x;
- for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+ for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
{
- for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+ for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
{
int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
int64_t input_xy = (input_y * _cp.input_width) + input_x;
- if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+ if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
{
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_pad.data();
}
else
{
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
}
}
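
prepare_indirect_buffer() above fills a table of row pointers: each (output position, kernel tap) entry points either at the matching input row or, when the tap lands in the padding region, at a shared zero row. A reduced standalone model with made-up sizes (single batch, single multi, channels innermost):

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    const int in_w = 4, in_h = 4, channels = 3;
    const int k_w = 3, k_h = 3, pad_left = 1, pad_top = 1;
    const int out_w = 4, out_h = 4, stride_w = 1, stride_h = 1;

    std::vector<float>         input(static_cast<size_t>(in_w) * in_h * channels, 1.f);
    std::vector<float>         pad_row(channels, 0.f); // shared zero-padding row
    std::vector<const float *> indirect(static_cast<size_t>(out_w) * out_h * k_w * k_h, nullptr);

    for (int oy = 0; oy < out_h; ++oy)
        for (int ox = 0; ox < out_w; ++ox)
            for (int ky = 0; ky < k_h; ++ky)
                for (int kx = 0; kx < k_w; ++kx)
                {
                    const int  ix        = ox * stride_w + kx - pad_left;
                    const int  iy        = oy * stride_h + ky - pad_top;
                    const int  kernel_xy = ky * k_w + kx;
                    const int  output_xy = oy * out_w + ox;
                    const bool outside   = ix < 0 || ix >= in_w || iy < 0 || iy >= in_h;
                    indirect[kernel_xy * (out_w * out_h) + output_xy] =
                        outside ? pad_row.data() : input.data() + (iy * in_w + ix) * channels;
                }
    std::printf("first entry points at the pad row: %d\n", static_cast<int>(indirect[0] == pad_row.data()));
    return 0;
}
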
@@ -332,12 +352,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
float zeropad = 0.f;
- if(is_data_type_quantized(a->data_type()))
+ if (is_data_type_quantized(a->data_type()))
{
zeropad = a->quantization_info().uniform().offset;
}
@@ -350,16 +373,25 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
- _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
- info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
- };
-
- if(info.method == AsmConvMethod::Conv)
+ _cp = {input_width,
+ input_height,
+ input_channels,
+ kernel_width,
+ kernel_height,
+ output_width,
+ output_height,
+ info.ps_info.stride().first,
+ info.ps_info.stride().second,
+ info.padding_top,
+ info.padding_left,
+ zeropad};
+
+ if (info.method == AsmConvMethod::Conv)
{
_gemm_kernel_asm->set_convolution_parameters(_cp);
}
- if(info.method == AsmConvMethod::Indirect)
+ if (info.method == AsmConvMethod::Indirect)
{
const unsigned int multis = 1;
const unsigned int batches = a->tensor_shape().total_size_upper(3);
@@ -372,19 +404,22 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
const int multi_size = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
- _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
- _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+ _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
+ reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+ _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
+ reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
_indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
// Set indirect argument
int64_t pos = 0;
- for(int64_t m = 0; m < multis; m++)
+ for (int64_t m = 0; m < multis; m++)
{
- for(int64_t b = 0; b < batches; b++)
+ for (int64_t b = 0; b < batches; b++)
{
- for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+ for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
{
- (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+ (_indirect_arg.get())[pos++] =
+ _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
}
}
}
@@ -394,8 +429,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
const OutputStage &os)
{
ARM_COMPUTE_UNUSED(c);
@@ -404,7 +443,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_is_c_constant = c ? c->are_values_constant() : true;
_gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
- if(_gemm_kernel_asm == nullptr)
+ if (_gemm_kernel_asm == nullptr)
{
//configuration not supported: Leave function unconfigured:
return;
@@ -419,13 +458,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
const size_t workspace_size = _gemm_kernel_asm->get_working_size();
const unsigned int alignment = 4096;
_workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
- _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+ _aux_mem[AsmGemmWorkspace] =
+ MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
    // If we disable the code below in braces then ConvLayer deadlocks when threads > 1 and
    // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
- if(window_size < static_cast<unsigned int>(args._maxthreads))
+ if (window_size < static_cast<unsigned int>(args._maxthreads))
{
_gemm_kernel_asm->set_nthreads(window_size);
}
@@ -434,18 +474,19 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_optimised_kernel = std::move(acl_gemm_wrapper);
_gemm_info = gemm_info;
// Check for pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
+ if (_gemm_kernel_asm->B_pretranspose_required())
{
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
_pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
- _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
- _B_pretranspose_required = true;
+ _aux_mem[Pretranspose] =
+ MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+ _B_pretranspose_required = true;
}
// Handle indirect GEMM convolution
- if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+ if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
{
configure_indirect(a, b, d, gemm_info);
}
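
configure() above registers two auxiliary buffers with different lifetimes: the scratch workspace is temporary (only needed while run() executes) with 4096-byte alignment, while the pretransposed B matrix is persistent (kept across runs) with 128-byte alignment. A standalone sketch of that bookkeeping, not the ACL experimental::MemoryRequirements API, with invented sizes:

#include <cstddef>
#include <cstdio>
#include <vector>

enum class Lifetime { Temporary, Persistent };

struct AuxMemory
{
    int      slot;
    Lifetime lifetime;
    size_t   size;
    size_t   alignment;
};

int main()
{
    std::vector<AuxMemory> aux_mem;
    const size_t workspace_size      = 256 * 1024; // hypothetical
    const size_t b_pretranspose_size = 512 * 1024; // hypothetical
    aux_mem.push_back({/*AsmGemmWorkspace*/ 0, Lifetime::Temporary, workspace_size, 4096});
    aux_mem.push_back({/*Pretranspose*/ 1, Lifetime::Persistent, b_pretranspose_size, 128});
    for (const auto &m : aux_mem)
    {
        std::printf("slot %d: %zu bytes, %zu-byte aligned, %s\n", m.slot, m.size, m.alignment,
                    m.lifetime == Lifetime::Persistent ? "persistent" : "temporary");
    }
    return 0;
}
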
@@ -454,34 +495,39 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
        // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(c && c->info()->data_type() == DataType::S32)
+ if (c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
+ if (_gemm_kernel_asm->B_pretranspose_required())
{
// Fixed format kernels need no pretranspose.
- ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+ const auto in1_ptr =
+ reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
- run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+ in1_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads());
b->mark_as_unused();
}
- if(_gemm_info.method == AsmConvMethod::Indirect)
+ if (_gemm_info.method == AsmConvMethod::Indirect)
{
prepare_indirect_buffer(tensors);
}
@@ -526,12 +572,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
int multi_stride_b = 0;
const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
- auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
const TypeInput *in1_ptr = nullptr;
auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
    // Check if B is pre-transposed and de-reference it if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
+ if (!_gemm_kernel_asm->B_is_pretransposed())
{
ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
@@ -539,30 +585,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
    // Run the pretranspose every time if either weights or biases are non-constant
- if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+ if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
{
- if(c && c->info()->data_type() == DataType::S32)
+ if (c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
- if(_B_pretranspose_required)
+ if (_B_pretranspose_required)
{
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto b_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+ const auto b_ptr =
+ reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
- if(_is_b_constant)
+ if (_is_b_constant)
{
_gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
}
else
{
- run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+ b_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads());
}
}
}
@@ -571,17 +621,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
// Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
- if(workspace.get()->buffer() != nullptr)
+ if (workspace.get()->buffer() != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
const unsigned int split_dim = scheduling_hint.split_dimension();
const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
+ if (window_size < num_threads)
{
num_threads = window_size;
}
- if(split_dim != IScheduler::split_dimensions_all)
+ if (split_dim != IScheduler::split_dimensions_all)
{
// Make sure the kernel does not expect more threads than we can actually spawn
const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
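
The clamping above ensures the assembly kernel is never scheduled with more threads than it has schedulable work: the thread count is first capped by the total window size and then, when a single dimension is split, presumably by the number of iterations along that dimension (the tail of that branch is not shown in this hunk). A standalone sketch with invented numbers:

#include <algorithm>
#include <cstdio>

int main()
{
    unsigned int       num_threads    = 8; // what the scheduler offers
    const unsigned int window_size    = 6; // total schedulable units (hypothetical)
    const unsigned int num_iterations = 3; // iterations along the chosen split dimension (hypothetical)

    num_threads = std::min(num_threads, window_size);
    num_threads = std::min(num_threads, num_iterations); // assumed clamp for the single-dimension split case
    std::printf("threads used: %u\n", num_threads);
    return 0;
}
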
@@ -595,12 +645,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
    // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
TypeOutput *bias = nullptr;
- if(c && c->info()->data_type() != DataType::S32)
+ if (c && c->info()->data_type() != DataType::S32)
{
bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
}
- if(_gemm_info.method == AsmConvMethod::Indirect)
+ if (_gemm_info.method == AsmConvMethod::Indirect)
{
in0_ptr = nullptr;
lda = 0;
@@ -609,18 +659,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
// Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr,
+ ldd, batch_stride_d, multi_stride_d, bias, 0);
// Schedule
NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
}
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
const CPUInfo &ci = NEScheduler::get().cpu_info();
@@ -628,7 +680,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -638,8 +691,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(activation);
Params p = extract_parameters(a, b, d, info);
@@ -648,7 +705,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
@@ -660,22 +718,20 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
const GEMMLowpOutputStageInfo os_info = info.output_stage;
arm_gemm::Requantize32 gemm_requant_info{};
- if(os_info.gemmlowp_shifts.size() > 1)
+ if (os_info.gemmlowp_shifts.size() > 1)
{
- const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
- std::get<2>(requantize_data),
- std::get<3>(requantize_data),
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ const auto requantize_data =
+ fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+ gemm_requant_info = arm_gemm::Requantize32(
+ nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset,
+ (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+ std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
}
else
{
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ gemm_requant_info =
+ arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift,
+ os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
}
// Configure fallback
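
create_arm_gemm_quant() above builds the requantization descriptor in one of two ways: per-channel (vectors of multipliers and shifts) when more than one shift is supplied, or per-tensor (a single multiplier and a negated shift) otherwise. The standalone sketch below models only that selection, not arm_gemm::Requantize32 itself; all names and values are illustrative.

#include <cstdint>
#include <cstdio>
#include <vector>

struct RequantParams
{
    bool                 per_channel = false;
    int32_t              multiplier  = 0;
    int32_t              shift       = 0;
    std::vector<int32_t> multipliers{};
    std::vector<int32_t> shifts{};
};

RequantParams make_requant(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers,
                           int32_t single_shift, int32_t single_multiplier)
{
    RequantParams p;
    if (shifts.size() > 1)
    {
        p.per_channel = true;
        p.shifts      = shifts;
        p.multipliers = multipliers;
    }
    else
    {
        p.multiplier = single_multiplier;
        p.shift      = -single_shift; // note the sign flip, as in the per-tensor branch above
    }
    return p;
}

int main()
{
    const auto p = make_requant({}, {}, /*single_shift=*/3, /*single_multiplier=*/1 << 20);
    std::printf("per_channel=%d shift=%d\n", static_cast<int>(p.per_channel), p.shift);
    return 0;
}
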
@@ -684,13 +740,16 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
}
} //namespace
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
- : _arm_gemm(nullptr)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr)
{
}
-Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_UNUSED(c);
@@ -701,53 +760,61 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg);
- switch(a->data_type())
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
+ switch (a->data_type())
{
case DataType::F32:
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F32 input");
break;
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for U8 input and U8 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8 input and U8 output");
}
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for S8 input and S8 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8 input and S8 output");
}
break;
#endif /* __aarch64__ */
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for BFLOAT16 input and F32 output");
break;
}
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for F16 input and F16 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F16 input and F16 output");
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
@@ -759,26 +826,30 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
return Status{};
}
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::validate(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(c, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run),
+ "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_data_type_quantized_per_channel(b->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ if (is_data_type_quantized_per_channel(b->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
}
- else if(is_fixed_format_fast_math(info.weight_format))
+ else if (is_fixed_format_fast_math(info.weight_format))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -787,22 +858,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32,
+ "Only F32 output supported for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16,
+ "Only F16 output supported for F16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32,
+ "Only F32 output supported for BFLOAT16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32,
+ "Only U32 output supported for U8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32,
+ "Only S32 output supported for S8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 &&
+ (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
"Only QASYMM8/S32 output supported for QASYMM8 input");
arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
- if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+ if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
{
// Correctness check: if the format expected by the kernel is
// not "any", make sure that the one found matches the format
// intended by the caller.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format),
- "The format expected by the kernel does not correspond with the one requested by the user.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (expected_weight_format != info.weight_format),
+ "The format expected by the kernel does not correspond with the one requested by the user.");
}
return ret;
}
@@ -813,18 +891,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo
return act.type != arm_gemm::Activation::Type::None;
}
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
//If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+ if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
{
return;
}
- switch(a->data_type())
+ switch (a->data_type())
{
case DataType::F32:
create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
@@ -832,7 +911,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
}
@@ -843,7 +922,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
}
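
The switch in configure() above picks a TypeInput/TypeOutput instantiation of the assembly GEMM glue from the input data type and, for the 8-bit cases, from whether the output is S32 (plain accumulation) or a quantized type (requantizing path). A standalone model of just that selection, covering only the cases visible in the hunks above:

#include <cstdio>
#include <string>

enum class DataType { F32, F16, U8, S8, QASYMM8, QASYMM8_SIGNED, S32 };

std::string pick_gemm(DataType a, DataType d)
{
    switch (a)
    {
        case DataType::F32:
            return "gemm<float, float>";
        case DataType::U8:
        case DataType::QASYMM8:
            return d == DataType::S32 ? "gemm<uint8_t, uint32_t>" : "gemm<uint8_t, uint8_t> + requantize";
        case DataType::S8:
        case DataType::QASYMM8_SIGNED:
            return d == DataType::S32 ? "gemm<int8_t, int32_t>" : "gemm<int8_t, int8_t> + requantize";
        default:
            return "not modelled here";
    }
}

int main()
{
    std::printf("%s\n", pick_gemm(DataType::QASYMM8, DataType::QASYMM8).c_str());
    return 0;
}
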
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index ceb7a3f775..5be39a54c0 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -42,20 +43,20 @@ enum class AsmConvMethod
struct AsmGemmInfo
{
- AsmConvMethod method{ AsmConvMethod::Im2Col };
+ AsmConvMethod method{AsmConvMethod::Im2Col};
PadStrideInfo ps_info{};
ActivationLayerInfo activation_info{};
GEMMLowpOutputStageInfo output_stage{};
- bool negated_offsets{ true };
- bool reinterpret_input_as_3d{ false };
- bool depth_output_gemm3d{ false };
- int64_t padding_top{ 0 };
- int64_t padding_left{ 0 };
- float padding_value{ 0.f };
- bool fast_mode{ false };
- bool fixed_format{ false };
- arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
- bool reshape_b_only_on_first_run{ true };
+ bool negated_offsets{true};
+ bool reinterpret_input_as_3d{false};
+ bool depth_output_gemm3d{false};
+ int64_t padding_top{0};
+ int64_t padding_left{0};
+ float padding_value{0.f};
+ bool fast_mode{false};
+ bool fixed_format{false};
+ arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
+ bool reshape_b_only_on_first_run{true};
};
/** Assembly kernel glue */
@@ -72,12 +73,12 @@ public:
class IFallback
{
public:
- virtual void run(ITensorPack &tensors) = 0;
- virtual void prepare(ITensorPack &tensors) = 0;
- virtual experimental::MemoryRequirements workspace() const = 0;
- virtual bool is_configured() const = 0;
- virtual bool isVarWeightsKernel() const = 0;
- virtual ~IFallback() = default;
+ virtual void run(ITensorPack &tensors) = 0;
+ virtual void prepare(ITensorPack &tensors) = 0;
+ virtual experimental::MemoryRequirements workspace() const = 0;
+ virtual bool is_configured() const = 0;
+ virtual bool isVarWeightsKernel() const = 0;
+ virtual ~IFallback() = default;
};
public:
@@ -121,7 +122,8 @@ public:
* @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
* @param[in] info GEMM meta-data
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+ void configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
/** Indicates whether or not this function can be used to process the given parameters.
*
@@ -133,7 +135,11 @@ public:
*
* @return a status.
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -144,7 +150,12 @@ public:
*
* @return a status.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
/** Checks if activation is supported by the gemm assembly dispatcher
*
* @param[in] activation Activation to check
@@ -167,8 +178,8 @@ public:
}
// Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
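
The IFallback interface declared earlier in this header is what lets the non-templated dispatcher own a TypeInput/TypeOutput-templated Fallback behind a plain pointer. A minimal standalone sketch of that type-erasure pattern (illustrative names only):

#include <cstdio>
#include <memory>

struct IFallbackModel
{
    virtual void prepare()    = 0;
    virtual void run()        = 0;
    virtual ~IFallbackModel() = default;
};

template <typename TypeInput, typename TypeOutput>
struct FallbackModel : IFallbackModel
{
    void prepare() override { std::printf("pretranspose B for %zu-byte inputs\n", sizeof(TypeInput)); }
    void run() override { std::printf("run GEMM producing %zu-byte outputs\n", sizeof(TypeOutput)); }
};

int main()
{
    // The owner only sees the interface; the concrete element types are erased.
    std::unique_ptr<IFallbackModel> gemm = std::make_unique<FallbackModel<float, float>>();
    gemm->prepare();
    gemm->run();
    return 0;
}
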
diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h
index ae1cffb659..e23b88a777 100644
--- a/src/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/cpu/utils/CpuAuxTensorHandler.h
@@ -39,25 +39,26 @@ namespace cpu
class CpuAuxTensorHandler
{
public:
- CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+ CpuAuxTensorHandler(
+ int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
: _tensor()
{
- if(info.total_size() == 0)
+ if (info.total_size() == 0)
{
return;
}
_tensor.allocator()->soft_init(info);
ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+ if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
{
- if(!bypass_alloc)
+ if (!bypass_alloc)
{
_tensor.allocator()->allocate();
ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
}
- if(pack_inject)
+ if (pack_inject)
{
pack.add_tensor(slot_id, &_tensor);
_injected_tensor_pack = &pack;
@@ -70,22 +71,21 @@ public:
}
}
- CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
- : _tensor()
+ CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) : _tensor()
{
_tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
+ if (info.total_size() <= tensor.info()->total_size())
{
_tensor.allocator()->import_memory(tensor.buffer());
}
}
- CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
+ CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete;
~CpuAuxTensorHandler()
{
- if(_injected_tensor_pack)
+ if (_injected_tensor_pack)
{
_injected_tensor_pack->remove_tensor(_injected_slot_id);
}
@@ -103,9 +103,9 @@ public:
private:
Tensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
+ ITensorPack *_injected_tensor_pack{nullptr};
+ int _injected_slot_id{TensorType::ACL_UNKNOWN};
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
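
CpuAuxTensorHandler above is an RAII helper: it reuses a buffer already present in the tensor pack when that buffer is large enough, allocates otherwise (unless allocation is bypassed), can inject itself into the pack, and cleans up on destruction. The reduced standalone model below keeps only the reuse-or-allocate choice; it is not the ACL class.

#include <cstddef>
#include <cstdio>
#include <vector>

class ScratchHandler
{
public:
    ScratchHandler(std::vector<unsigned char> *packed, size_t required)
    {
        if (packed != nullptr && packed->size() >= required)
        {
            _buffer = packed->data(); // reuse memory provided by the caller
        }
        else
        {
            _owned.resize(required); // fall back to a local allocation, released on destruction
            _buffer = _owned.data();
        }
    }
    unsigned char *get() { return _buffer; }

private:
    std::vector<unsigned char> _owned{};
    unsigned char             *_buffer{nullptr};
};

int main()
{
    std::vector<unsigned char> pool(1024);
    ScratchHandler             reuse(&pool, 512);  // big enough: points into 'pool'
    ScratchHandler             alloc(&pool, 4096); // too small: allocates its own storage
    std::printf("reused=%d allocated=%d\n", static_cast<int>(reuse.get() == pool.data()),
                static_cast<int>(alloc.get() != pool.data()));
    return 0;
}
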
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
index 15a5632d0b..9ca20fa152 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
#include "ClKernelRuntime.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/CLUtils.h"
#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
#include "src/gpu/cl/ClKernelLibrary.h"
-
#include "support/Cast.h"
namespace arm_compute
{
@@ -43,13 +44,12 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe
{
// Create kernel from kernel source string
opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
- _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(),
- code.name(), // program name has to be provided to differentiate between different unfusable components' kernels.
- // Each program contains exactly one kernel
- code.code(),
- klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
- code.build_options().options(),
- false /* Is source binary */));
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(
+ code.name(),
+ code.name(), // program name has to be provided to differentiate between different unfusable components' kernels.
+ // Each program contains exactly one kernel
+ code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(), false /* Is source binary */));
// Configure execution window
IClKernel::configure_internal(code.window());
@@ -63,11 +63,15 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
+inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx,
+ const GpuKernelArgumentInfo &arg,
+ const ICLTensor *tensor,
+ const Window &arg_slice,
+ std::vector<cl::Image2D> &cl_images)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- switch(arg.type)
+ switch (arg.type)
{
case GpuKernelArgumentInfo::Type::Scalar:
{
@@ -95,9 +99,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
}
case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
{
- const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
+ tensor->info()->dimension(2) *
+ tensor->info()->dimension(3));
const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ cl::Image2D tensor_image2d =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
+ tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
break;
@@ -111,9 +119,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
}
case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
{
- const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
+ tensor->info()->dimension(2) *
+ tensor->info()->dimension(3));
const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ cl::Image2D tensor_image2d =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
+ tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
@@ -142,8 +154,9 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
- TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(
+ CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h),
+ tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
@@ -170,13 +183,16 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
}
#else // ACL_INTERNAL_TEST_CKW_IN_DF
-inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images)
+inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images)
{
- switch(arg.type())
+ switch (arg.type())
{
case GpuKernelArgumentBinding::Type::TensorStorage:
{
- switch(arg.tensor_storage_type())
+ switch (arg.tensor_storage_type())
{
case TensorStorageType::ClBufferUint8Ptr:
{
@@ -238,7 +254,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
// CLImages created from tensor arguments. Need to be retained until enqueue
std::vector<cl::Image2D> cl_images;
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- for(auto id_arg : _arguments)
+ for (auto id_arg : _arguments)
{
const auto arg = id_arg.second;
auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
@@ -248,7 +264,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
}
#else // ACL_INTERNAL_TEST_CKW_IN_DF
- for(const auto &arg : _arguments)
+ for (const auto &arg : _arguments)
{
auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
@@ -259,8 +275,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
// Dispatch kernel
enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
- }
- while(skip_sliding_window && window.slide_window_slice_3D(slice));
+ } while (skip_sliding_window && window.slide_window_slice_3D(slice));
}
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
index 92e73503ce..e78567eb9d 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -68,7 +68,11 @@ private:
* @param[in] arg_slice Window the kernel will be run on
* @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
*/
- inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
+ inline void add_tensor_argument(unsigned int &idx,
+ const GpuKernelArgumentInfo &arg,
+ const ICLTensor *tensor,
+ const Window &arg_slice,
+ std::vector<cl::Image2D> &cl_images);
#else // ACL_INTERNAL_TEST_CKW_IN_DF
/** Set a kernel argument as part of a tensor
*
@@ -77,7 +81,10 @@ private:
* @param[in] tensor Tensor of which the kernel argument @p arg is a part of
* @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
*/
- inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images);
+ inline void add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images);
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
private:
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
index cd21b10180..ba39ff4c9d 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+
#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
@@ -55,14 +56,14 @@ public:
{
DataView() = default;
DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
- : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
+ : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info}
{
}
- ~DataView() = default;
- DataView(const DataView &other) = default;
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
DataView &operator=(const DataView &other) = default;
DataView(DataView &&other) = default;
- DataView &operator=(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
TensorInfo tensor_info{}; /**< Associated tensor info */
AuxMemoryInfo memory_info{}; /**< Memory requirement */
@@ -92,7 +93,7 @@ private:
{
const auto t_id = tensor_info.id();
auto find_tensor_pair = _owned_tensors.find(t_id);
- if(find_tensor_pair != _owned_tensors.end())
+ if (find_tensor_pair != _owned_tensors.end())
{
return find_tensor_pair->second.get();
}
@@ -107,7 +108,7 @@ private:
}
std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
- std::vector<DataView> _tensors{};
+ std::vector<DataView> _tensors{};
};
/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
*
@@ -120,12 +121,12 @@ private:
*/
Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
{
- for(auto t_id : code.tensors())
+ for (auto t_id : code.tensors())
{
// Get tensor object
const auto workload_arg = code.query_tensor(t_id);
ICLTensor *tensor_object = nullptr;
- if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
{
// Create aux tensor CLTensor object
const TensorInfo tensor_info = *workload_arg->tensor_info();
@@ -133,7 +134,7 @@ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode
const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
- if(tensor_object == nullptr)
+ if (tensor_object == nullptr)
{
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
}
@@ -156,7 +157,7 @@ public:
ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
{
auto tensor_pack = _tensor_packs.find(uwk_id);
- if(tensor_pack != _tensor_packs.end())
+ if (tensor_pack != _tensor_packs.end())
{
return &(tensor_pack->second);
}
@@ -173,7 +174,10 @@ public:
return _tensor_packs.at(uwk_id);
}
- friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors);
private:
/** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
@@ -197,19 +201,22 @@ private:
*
* @return Status
*/
-Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
+Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors)
{
// Combine user tensors and aux tensors
std::map<ITensorInfo::Id, CLTensor *> tensor_map;
- for(auto tensor : user_tensors)
+ for (auto tensor : user_tensors)
{
const auto t_id = tensor->info()->id();
- if(tensor_map.find(t_id) != tensor_map.end())
+ if (tensor_map.find(t_id) != tensor_map.end())
{
// In case of elementwise in-place: give another Id to the In/Out tensor when passed again
std::vector<ITensorInfo::Id> ids;
- for(auto &t : tensor_map)
+ for (auto &t : tensor_map)
{
ids.push_back(t.first);
}
@@ -221,11 +228,11 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c
tensor_map[t_id] = tensor;
}
}
- for(const auto &data : aux_tensors.get_tensors())
+ for (const auto &data : aux_tensors.get_tensors())
{
const auto t_id = data.tensor_info.id();
const auto tensor = data.tensor;
- if(tensor_map.find(t_id) != tensor_map.end())
+ if (tensor_map.find(t_id) != tensor_map.end())
{
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
}
@@ -233,25 +240,25 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c
}
// Add tensor objects into corresponding tensor packs
- for(auto id_tensor : tensor_map)
+ for (auto id_tensor : tensor_map)
{
const auto t_id = id_tensor.first;
const auto tensor_object = id_tensor.second;
- if(tensor_object == nullptr)
+ if (tensor_object == nullptr)
{
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
}
- if(tensor_object->allocator()->info().total_size() == 0U)
+ if (tensor_object->allocator()->info().total_size() == 0U)
{
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
}
- for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
{
ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
- if(tensor_pack == nullptr)
+ if (tensor_pack == nullptr)
{
- tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
}
else
{
@@ -269,15 +276,14 @@ struct ClWorkloadRuntime::Implementation
{
std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
- bool _is_configured{ false };
- bool _is_prepared{ false };
- ClTensorLUT _tensor_lut{};
- ClAuxTensors _aux_tensors{};
- GpuWorkloadSourceCode _source_code{};
+ bool _is_configured{false};
+ bool _is_prepared{false};
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
};
-ClWorkloadRuntime::ClWorkloadRuntime()
- : _impl{ std::make_unique<Implementation>() }
+ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique<Implementation>()}
{
}
@@ -286,18 +292,19 @@ ClWorkloadRuntime::~ClWorkloadRuntime() = default;
Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL,
+ "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
// Generate source code
_impl->_source_code = sketch.implementation().generate_source_code();
// Configure unit workload from source code
- for(auto uwk_id : _impl->_source_code.unit_workloads())
+ for (auto uwk_id : _impl->_source_code.unit_workloads())
{
const auto work = _impl->_source_code.query_unit_workload(uwk_id);
const auto stage = work.stage().stage;
auto k = std::make_unique<ClKernelRuntime>();
k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
- switch(stage)
+ switch (stage)
{
case UnitWorkloadStage::Stage::Run:
{
@@ -323,9 +330,9 @@ Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
void ClWorkloadRuntime::prepare()
{
- if(!_impl->_is_prepared)
+ if (!_impl->_is_prepared)
{
- for(auto &id_kernel_pair : _impl->_kernels_prep)
+ for (auto &id_kernel_pair : _impl->_kernels_prep)
{
const bool flush_queue = false;
const auto uwk_id = id_kernel_pair.first;
@@ -344,7 +351,7 @@ Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
ARM_COMPUTE_RETURN_ON_ERROR(st);
prepare();
- for(auto &id_kernel_pair : _impl->_kernels)
+ for (auto &id_kernel_pair : _impl->_kernels)
{
// Flush the command queue on the last kernel
const bool flush_queue = false;
@@ -358,7 +365,7 @@ Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
{
std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
- for(const auto &data : _impl->_aux_tensors.get_tensors())
+ for (const auto &data : _impl->_aux_tensors.get_tensors())
{
aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
}
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
index 84fb279237..7044b0ea66 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
@@ -30,14 +30,17 @@ namespace experimental
{
namespace dynamic_fusion
{
-void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component)
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
const auto *info = tensor->info();
const auto &strides = info->strides_in_bytes();
- switch(component)
+ switch (component)
{
case TensorComponentType::OffsetFirstElement:
kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes());
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
index 4cbb157a48..306d547acb 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
@@ -42,7 +42,10 @@ namespace dynamic_fusion
* @param[in] tensor Tensor from which to access the tensor component.
* @param[in] component Tensor component to select such as tensor dimensions, strides, etc.
*/
-void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component);
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component);
/** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx.
*
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
index f118d7d851..3bf380b1ec 100644
--- a/src/dynamic_fusion/sketch/ArgumentPack.h
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
#include "arm_compute/core/experimental/Types.h"
+
#include <unordered_map>
#include <vector>
@@ -52,26 +53,21 @@ public:
*/
struct PackElement
{
- PackElement() = default;
- PackElement(const PackElement &elem) = default;
+ PackElement() = default;
+ PackElement(const PackElement &elem) = default;
PackElement &operator=(const PackElement &elem) = default;
PackElement(PackElement &&elem) = default;
- PackElement &operator=(PackElement &&elem) = default;
- PackElement(Id id, T *tensor)
- : id(id), tensor(tensor), ctensor(nullptr)
+ PackElement &operator=(PackElement &&elem) = default;
+ PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr)
{
}
- PackElement(Id id, const T *ctensor)
- : id(id), tensor(nullptr), ctensor(ctensor)
+ PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
{
}
- Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */
- T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */
- const T *ctensor
- {
- nullptr
- }; /**< Const pointer to tensor-related object */
+ Id id{ACL_UNKNOWN}; /**< Argument id within the pack */
+ T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */
+ const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */
};
public:
@@ -88,10 +84,9 @@ public:
/** Allow instances of this class to be moved */
ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
/** Initializer list Constructor */
- ArgumentPack(const std::initializer_list<PackElement> &l)
- : _pack{}
+ ArgumentPack(const std::initializer_list<PackElement> &l) : _pack{}
{
- for(const auto &e : l)
+ for (const auto &e : l)
{
_pack[e.id] = e;
}
@@ -134,7 +129,7 @@ public:
const T *get_const_tensor(Id id) const
{
auto it = _pack.find(id);
- if(it != _pack.end())
+ if (it != _pack.end())
{
return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
}
@@ -171,10 +166,10 @@ public:
std::vector<T *> get_src_tensors()
{
std::vector<T *> src_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
{
auto tensor = get_tensor(static_cast<TensorType>(id));
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
src_tensors.push_back(tensor);
}
@@ -188,10 +183,10 @@ public:
std::vector<const T *> get_const_src_tensors() const
{
std::vector<const T *> src_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
{
auto tensor = get_const_tensor(static_cast<TensorType>(id));
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
src_tensors.push_back(tensor);
}
@@ -205,10 +200,10 @@ public:
std::vector<T *> get_dst_tensors()
{
std::vector<T *> dst_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
{
auto tensor = get_tensor(static_cast<TensorType>(id));
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
dst_tensors.push_back(tensor);
}
@@ -222,10 +217,10 @@ public:
std::vector<const T *> get_const_dst_tensors() const
{
std::vector<const T *> dst_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
{
auto tensor = get_const_tensor(static_cast<TensorType>(id));
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
dst_tensors.push_back(tensor);
}
diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
index 3a5657e07b..6f3816568c 100644
--- a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
+++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
@@ -69,7 +69,8 @@ uint32_t DepthwiseConv2dAttributes::depth_multiplier() const
return _depth_multiplier;
}
-DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type)
+DepthwiseConv2dAttributes &
+DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type)
{
_dimension_rounding_type = dimension_rounding_type;
return *this;
diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
index c28791f5fe..80f65f926a 100644
--- a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
+++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
#include "arm_compute/core/Size2D.h"
namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 226e1a2df3..03817173f4 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -61,11 +61,10 @@ struct GpuKernelArgumentInfo
/** Default constructor */
GpuKernelArgumentInfo() = default;
/** Constructor */
- GpuKernelArgumentInfo(Type type)
- : type{ type }
+ GpuKernelArgumentInfo(Type type) : type{type}
{
}
- Type type{ Type::Tensor_4D_t_Buffer };
+ Type type{Type::Tensor_4D_t_Buffer};
};
bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
/** Kernel argument information linked with its corresponding @ref ITensorInfo
@@ -79,10 +78,8 @@ public:
* @param[in] tensor_info Associated @ref ITensorInfo
* @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
*/
- GpuKernelArgument(const ITensorInfo &tensor_info,
- const GpuKernelArgumentInfo &kernel_arg_info)
- : _tensor_info{ tensor_info },
- _kernel_arg_info{ kernel_arg_info }
+ GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info}
{
}
/** Get workload tensor id */
@@ -200,12 +197,12 @@ public:
TensorComponent /** @ref TensorComponentType */
};
GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
- : _type{ Type::TensorStorage }, _id{ id }, _value{}
+ : _type{Type::TensorStorage}, _id{id}, _value{}
{
_value.tensor_storage_type = storage;
}
GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
- : _type{ Type::TensorComponent }, _id{ id }, _value{}
+ : _type{Type::TensorComponent}, _id{id}, _value{}
{
_value.tensor_component_type = component;
}
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
index 5a65ede38b..1a458c9862 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -31,35 +31,31 @@ namespace experimental
{
namespace dynamic_fusion
{
-std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+std::vector<DependencyGraph::TensorId>
+GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
{
std::vector<DependencyGraph::TensorId> tensor_ids{};
- std::transform(
- std::begin(tensors), std::end(tensors),
- std::back_inserter(tensor_ids),
- [](const auto & t)
- {
- return t->id();
- });
+ std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+ [](const auto &t) { return t->id(); });
return tensor_ids;
}
GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services)
- : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+ : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{}
{
}
GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const
{
- GpuKernelComponentStream stream{ _context, _services, mem_map };
+ GpuKernelComponentStream stream{_context, _services, mem_map};
const auto op_seq = _dependency_graph.build_operators_sequence();
stream.new_component_group();
- for(auto op : op_seq)
+ for (auto op : op_seq)
{
const auto component = _components.at(op.op).get();
const auto success = stream.add_component(component);
- if(!success) // Assume first failure was because the root component is unfusable
+ if (!success) // Assume first failure was because the root component is unfusable
{
stream.new_component_group();
const auto success = stream.add_component(component);
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
index 85c9b45840..6f871a3c90 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -70,21 +70,21 @@ public:
* @param[in] args Component arguments except for component id, which is auto-allocated
*/
template <typename T, typename... Args>
- void add_new_component(Args &&... args)
+ void add_new_component(Args &&...args)
{
- auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
- ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
- bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
ARM_COMPUTE_UNUSED(success);
ARM_COMPUTE_ERROR_ON(!success);
_components[comp->id()] = std::move(comp);
- for(auto t : tensors.get_const_src_tensors())
+ for (auto t : tensors.get_const_src_tensors())
{
_tensors[t->id()] = t;
}
- for(auto t : tensors.get_const_dst_tensors())
+ for (auto t : tensors.get_const_dst_tensors())
{
_tensors[t->id()] = t;
}
@@ -99,11 +99,11 @@ public:
private:
static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
- GpuWorkloadContext *_context;
- GpuComponentServices *_services;
+ GpuWorkloadContext *_context;
+ GpuComponentServices *_services;
std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
- DependencyGraph _dependency_graph{};
+ DependencyGraph _dependency_graph{};
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 81c3f0c800..5a6d125d96 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Validate.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
#include <algorithm>
@@ -37,86 +38,87 @@ namespace dynamic_fusion
{
bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
- ARM_COMPUTE_ERROR_ON_MSG(
- _finalized, "The component group has been finalized and cannot be altered.");
+ ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered.");
// note: Constraint 1 is guaranteed as a precondition
// Constraint 2
- if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
{
return false;
}
// Constraint 3.1: Pattern: (Unfusable + Output)
- if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
+ if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable &&
+ component->type() != GpuComponentType::Output)
{
return false;
}
// Constraint 3.2
- if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ if (!_components.empty() &&
+ (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
{
return false;
}
// Constraint 4
- if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
{
return false;
}
// Constraint 5
- if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ if (!_components.empty() && !(get_root_component()->properties() == component->properties()))
{
return false;
}
// Constraint 7
- if(!_components.empty())
+ if (!_components.empty())
{
const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
const auto first_dst_tensor = root_dst_tensors[0];
const auto dst_tensors = component->tensors().get_const_dst_tensors();
- for(const auto &t : root_dst_tensors)
+ for (const auto &t : root_dst_tensors)
{
- if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
{
return false;
}
}
- for(const auto &t : dst_tensors)
+ for (const auto &t : dst_tensors)
{
- if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
{
return false;
}
}
}
// Constraint 8
- if(!_components.empty())
+ if (!_components.empty())
{
const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
const auto dst_tensors = component->tensors().get_const_dst_tensors();
- for(const auto &t : root_dst_tensors)
+ for (const auto &t : root_dst_tensors)
{
- if(t->data_layout() != first_dst_tensor_layout)
+ if (t->data_layout() != first_dst_tensor_layout)
{
return false;
}
}
- for(const auto &t : dst_tensors)
+ for (const auto &t : dst_tensors)
{
- if(t->data_layout() != first_dst_tensor_layout)
+ if (t->data_layout() != first_dst_tensor_layout)
{
return false;
}
}
}
// Constraint 9
- if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
{
return false;
}
// Constraint 9 corollary
- if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
{
return false;
}
@@ -126,36 +128,36 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component)
void GpuKernelComponentGroup::finalize()
{
- if(_finalized)
+ if (_finalized)
{
return;
}
_finalized = true;
- std::set<const ITensorInfo *> output_tensors;
+ std::set<const ITensorInfo *> output_tensors;
std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
- std::map<const ITensorInfo *, int32_t> tile_usages;
+ std::map<const ITensorInfo *, int32_t> tile_usages;
- for(auto component : _components)
+ for (auto component : _components)
{
- const auto tensors = component->tensors();
+ const auto tensors = component->tensors();
const auto src_tensors = tensors.get_const_src_tensors();
const auto dst_tensors = tensors.get_const_dst_tensors();
// Detect input, output and intermediate tensors.
- for(auto tensor : src_tensors)
+ for (auto tensor : src_tensors)
{
const auto output_tensors_it = output_tensors.find(tensor);
- if(output_tensors_it != output_tensors.end())
+ if (output_tensors_it != output_tensors.end())
{
// This tensor is the output of another operator.
// It must be marked as intermediate tensor.
output_tensors.erase(output_tensors_it);
_interm_tensors.insert(tensor);
}
- else if(_interm_tensors.find(tensor) == _interm_tensors.end())
+ else if (_interm_tensors.find(tensor) == _interm_tensors.end())
{
_input_tensors.insert(tensor);
@@ -164,7 +166,7 @@ void GpuKernelComponentGroup::finalize()
}
}
- for(auto tensor : dst_tensors)
+ for (auto tensor : dst_tensors)
{
ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
@@ -177,27 +179,27 @@ void GpuKernelComponentGroup::finalize()
// Check if the output can overwrite the input tile.
const auto component_type = component->type();
- if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+ if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
{
ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
- const auto dst_tensor = dst_tensors[0];
- const auto &dst_shape = dst_tensor->tensor_shape();
- const auto &dst_type = dst_tensor->data_type();
+ const auto dst_tensor = dst_tensors[0];
+ const auto &dst_shape = dst_tensor->tensor_shape();
+ const auto &dst_type = dst_tensor->data_type();
tile_usages[dst_tensor] = 0;
- for(auto src_tensor : src_tensors)
+ for (auto src_tensor : src_tensors)
{
const auto &src_shape = src_tensor->tensor_shape();
- const auto &src_type = src_tensor->data_type();
+ const auto &src_type = src_tensor->data_type();
- if(src_shape == dst_shape && src_type == dst_type)
+ if (src_shape == dst_shape && src_type == dst_type)
{
const auto tile_usages_it = tile_usages.find(src_tensor);
ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
- if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+ if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
{
// Increase the number of tile usages unless this component is an output
// and the tile has not been shared with any component.
@@ -212,7 +214,7 @@ void GpuKernelComponentGroup::finalize()
else
{
// Outputs of complex and unfusable components need dedicated tile.
- for(auto tensor : dst_tensors)
+ for (auto tensor : dst_tensors)
{
tile_usages[tensor] = 0;
}
@@ -220,25 +222,25 @@ void GpuKernelComponentGroup::finalize()
}
// Find the smallest list of tiles that the intermediate tensors need to write to.
- for(auto tensor : _input_tensors)
+ for (auto tensor : _input_tensors)
{
_tile_map[tensor] = tensor;
}
- for(auto component : _components)
+ for (auto component : _components)
{
const auto dst_tensors = component->tensors().get_const_dst_tensors();
- for(auto tensor : dst_tensors)
+ for (auto tensor : dst_tensors)
{
const auto target_tiles = possible_tile_map.at(tensor);
- _tile_map[tensor] = tensor;
+ _tile_map[tensor] = tensor;
- for(auto target : target_tiles)
+ for (auto target : target_tiles)
{
const auto num_usage = tile_usages[target];
- if(num_usage <= 1)
+ if (num_usage <= 1)
{
// The target tile is consumed by only this operator, so we can reuse it
// for the destination tensor data.
@@ -249,26 +251,23 @@ void GpuKernelComponentGroup::finalize()
}
}
- for(auto tensor : output_tensors)
+ for (auto tensor : output_tensors)
{
_tile_map[tensor] = tensor;
}
// All intermediate tensors that cannot be shared with any previous tensor
// will need to be declared as tile variable.
- for(auto tensor_tile : _tile_map)
+ for (auto tensor_tile : _tile_map)
{
- if(tensor_tile.first == tensor_tile.second &&
- _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+ if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
{
_tiles.push_back(tensor_tile.first);
}
}
- std::set_union(
- _input_tensors.begin(), _input_tensors.end(),
- output_tensors.begin(), output_tensors.end(),
- std::back_inserter(_argument_tensors));
+ std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(),
+ std::back_inserter(_argument_tensors));
_any_output_tensor = *output_tensors.begin();
}
@@ -282,7 +281,7 @@ const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInf
{
ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
- if(_tile_map.find(tensor) != _tile_map.end())
+ if (_tile_map.find(tensor) != _tile_map.end())
{
return _tile_map.at(tensor);
}
@@ -304,7 +303,7 @@ std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors()
GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
{
- if(empty())
+ if (empty())
{
return nullptr;
}
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
index c939aec369..6ad71abb39 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -25,12 +25,11 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
#include "components/Types.h"
-
#include <cstdint>
#include <cstdlib>
-#include <vector>
-#include <set>
#include <map>
+#include <set>
+#include <vector>
namespace arm_compute
{
@@ -129,9 +128,9 @@ public:
/** Get the number of components within the group */
size_t size() const;
/** Check if the component group is empty */
- bool empty() const;
- ComponentPtr &operator[](size_t index);
- const ComponentPtr &operator[](size_t index) const;
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
typename std::vector<ComponentPtr>::iterator begin();
typename std::vector<ComponentPtr>::iterator end();
typename std::vector<ComponentPtr>::const_iterator begin() const;
@@ -142,13 +141,13 @@ public:
private:
std::vector<ComponentPtr> _components{};
- bool _finalized{ false };
+ bool _finalized{false};
- std::vector<const ITensorInfo *> _argument_tensors{};
- std::set<const ITensorInfo *> _input_tensors{};
- std::set<const ITensorInfo *> _interm_tensors{};
- const ITensorInfo *_any_output_tensor{ nullptr };
- std::vector<const ITensorInfo *> _tiles{};
+ std::vector<const ITensorInfo *> _argument_tensors{};
+ std::set<const ITensorInfo *> _input_tensors{};
+ std::set<const ITensorInfo *> _interm_tensors{};
+ const ITensorInfo *_any_output_tensor{nullptr};
+ std::vector<const ITensorInfo *> _tiles{};
std::map<const ITensorInfo *, const ITensorInfo *> _tile_map{};
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
index a2b6623370..8042e3dd08 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -23,9 +23,9 @@
*/
#include "GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
{
@@ -33,8 +33,10 @@ namespace experimental
{
namespace dynamic_fusion
{
-GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
- : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map }
+GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context,
+ GpuComponentServices *services,
+ const MemoryDescriptorMap &mem_map)
+ : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map}
{
}
@@ -42,7 +44,7 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
{
GpuWorkloadSourceCode source_code;
// Traverse through component groups and assemble workload together
- for(auto && group : _component_groups)
+ for (auto &&group : _component_groups)
{
group.finalize();
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
index ba2503a938..ef8a8a15b0 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
@@ -53,7 +54,9 @@ public:
* @param[in] services @ref GpuComponentServices to be used throughout the stream
* @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
*/
- GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+ GpuKernelComponentStream(GpuWorkloadContext *context,
+ GpuComponentServices *services,
+ const MemoryDescriptorMap &mem_map);
/** Allow instances of this class to be copy constructed */
GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
/** Allow instances of this class to be copied */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
index 64e1cdc3bc..24812cd8a7 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Window.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
index c99984fc0e..502ceab807 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -26,9 +26,9 @@
#include "arm_compute/core/experimental/Types.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
#else // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -42,7 +42,7 @@ namespace experimental
namespace dynamic_fusion
{
GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
- : _comp_group{ components }, _store_components{}
+ : _comp_group{components}, _store_components{}
{
ARM_COMPUTE_UNUSED(services);
}
@@ -51,9 +51,9 @@ GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
{
GpuKernelSourceCode code;
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- ClTemplateWriter writer { _comp_group };
+ ClTemplateWriter writer{_comp_group};
#else // ACL_INTERNAL_TEST_CKW_IN_DF
- GpuCkwDriver writer { _comp_group };
+ GpuCkwDriver writer{_comp_group};
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
code.name(writer.get_name());
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
index 7bb14c8698..aec8b9db4f 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -36,20 +36,15 @@ namespace
std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
{
std::vector<DependencyGraph::TensorId> tensor_ids{};
- std::transform(
- std::begin(tensors), std::end(tensors),
- std::back_inserter(tensor_ids),
- [](const auto & t)
- {
- return t->id();
- });
+ std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+ [](const auto &t) { return t->id(); });
return tensor_ids;
}
} // namespace
Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
- : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors }
+ : _id{id}, _operator_type{operator_type}, _tensors{tensors}
{
}
@@ -73,69 +68,69 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) cons
const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
// Constraint 1
- if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
+ if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
{
return false;
}
// Constraint 2
- if(_operators.size() >= max_fused_operators)
+ if (_operators.size() >= max_fused_operators)
{
return false;
}
// Constraint 3.1: Pattern: (Unfusable)
- if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
{
return false;
}
// Constraint 3.2
- if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
{
return false;
}
// Constraint 4
- if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
{
return false;
}
// Constraint 5
- if(_operators.size() > 0)
+ if (_operators.size() > 0)
{
const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
const auto first_dst_tensor = root_dst_tensors[0];
const auto dst_tensors = op.tensors().get_const_dst_tensors();
- for(const auto &t : root_dst_tensors)
+ for (const auto &t : root_dst_tensors)
{
- if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
{
return false;
}
}
- for(const auto &t : dst_tensors)
+ for (const auto &t : dst_tensors)
{
- if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
{
return false;
}
}
}
// Constraint 6
- if(_operators.size() > 0)
+ if (_operators.size() > 0)
{
const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
const auto dst_tensors = op.tensors().get_const_dst_tensors();
- for(const auto &t : root_dst_tensors)
+ for (const auto &t : root_dst_tensors)
{
- if(t->data_layout() != first_dst_tensor_layout)
+ if (t->data_layout() != first_dst_tensor_layout)
{
return false;
}
}
- for(const auto &t : dst_tensors)
+ for (const auto &t : dst_tensors)
{
- if(t->data_layout() != first_dst_tensor_layout)
+ if (t->data_layout() != first_dst_tensor_layout)
{
return false;
}
@@ -151,16 +146,17 @@ void GpuOperatorGroup::add_operator(const Operator &op, bool is_output)
_graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output);
_operators[op.id()] = op;
}
-Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type,
+ const ArgumentPack<ITensorInfo> &tensors) const
{
auto new_id = static_cast<OperatorId>(_operators.size());
- return Operator{ new_id, operator_type, tensors };
+ return Operator{new_id, operator_type, tensors};
}
const Operator *GpuOperatorGroup::get_root_operator() const
{
const auto roots = _graph.get_root_ops();
ARM_COMPUTE_ERROR_ON(roots.size() > 1);
- if(roots.empty())
+ if (roots.empty())
{
return nullptr;
}
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
index 308a9d796a..0a2369d357 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -25,9 +25,11 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
#include "arm_compute/core/ITensorInfo.h"
+
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
#include <map>
namespace arm_compute
@@ -104,7 +106,7 @@ public:
const Operator *get_root_operator() const;
private:
- DependencyGraph _graph{};
+ DependencyGraph _graph{};
std::map<OperatorId, Operator> _operators{};
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
index c2bd012703..36cad790c7 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -23,7 +23,9 @@
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
#include "arm_compute/core/CL/CLCompileContext.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
namespace arm_compute
@@ -33,7 +35,7 @@ namespace experimental
namespace dynamic_fusion
{
GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
- : _impl{ std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx) }
+ : _impl{std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx)}
{
}
@@ -74,7 +76,11 @@ const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const
}
GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
- : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info()
+ : _gpu_language(gpu_language),
+ _cl_compile_ctx(cl_compile_ctx),
+ _next_tensor_id(1),
+ _mem_map(),
+ _managed_tensor_info()
{
}
@@ -100,7 +106,7 @@ void GpuWorkloadContext::Impl::register_user_tensor(ITensorInfo &tensor_info)
const auto tensor_id = next_tensor_id();
tensor_info.set_id(tensor_id);
- _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User };
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User};
// Save a *copy* of the user tensor info in workload context for future reference
// Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context
_managed_tensor_info.emplace(tensor_info.id(), std::make_unique<TensorInfo>(tensor_info));
@@ -111,7 +117,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor()
auto tensor_info = std::make_unique<TensorInfo>();
const auto tensor_id = -next_tensor_id();
tensor_info->set_id(tensor_id);
- _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual };
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual};
auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
return inserted.first->second.get();
}
@@ -121,7 +127,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo
auto tensor_info = std::make_unique<TensorInfo>(itensor_info);
const auto tensor_id = next_tensor_id();
tensor_info->set_id(tensor_id);
- _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } };
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}};
auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
return inserted.first->second.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
index c169476a70..7d9699031f 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
@@ -27,8 +27,8 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
namespace arm_compute
{
@@ -93,8 +93,8 @@ private:
GpuLanguage _gpu_language;
CLCompileContext *_cl_compile_ctx;
- ITensorInfo::Id _next_tensor_id;
- MemoryDescriptorMap _mem_map;
+ ITensorInfo::Id _next_tensor_id;
+ MemoryDescriptorMap _mem_map;
std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info;
};
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
index d3a20c0dfe..973f7c747f 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
namespace arm_compute
@@ -30,8 +31,7 @@ namespace experimental
{
namespace dynamic_fusion
{
-GpuWorkloadSketch::GpuWorkloadSketch(Context *context)
- : _impl{ std::make_unique<Implementation>(context) }
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique<Implementation>(context)}
{
}
GpuWorkloadSketch::~GpuWorkloadSketch()
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
index d3033898e9..fea4fe9577 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -24,8 +24,9 @@
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
-#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
@@ -45,12 +46,8 @@ public:
*
* @param[in] context global workload creation context
*/
- explicit Implementation(
- Context *context)
- : _context{ context },
- _comp_services{},
- _component_graph{ _context, &_comp_services },
- _operator_group{}
+ explicit Implementation(Context *context)
+ : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{}
{
}
/** Prevent instances of this class from being copy constructed */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
index 578366daaf..43bcc47fa0 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
@@ -45,7 +46,7 @@ namespace
*/
GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args)
{
- if(flat_kernel_args.empty())
+ if (flat_kernel_args.empty())
{
return {};
}
@@ -56,10 +57,10 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &
flat_kernel_args.pop_front();
const auto tensor_id = karg_head.id();
- while(!flat_kernel_args.empty())
+ while (!flat_kernel_args.empty())
{
const GpuKernelArgumentBinding &karg = flat_kernel_args.front();
- if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
+ if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
{
return tensor_kargs;
}
@@ -68,7 +69,7 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &
}
return tensor_kargs;
}
-}
+} // namespace
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
using UnitWorkloadId = int32_t;
@@ -92,9 +93,7 @@ public:
GpuWorkloadArgument(const ITensorInfo &tensor_info,
const MemoryDescriptor &mem_desc,
const GpuKernelArgumentInfo &kernel_arg_info)
- : _tensor_info{ tensor_info },
- _mem_desc{ mem_desc },
- _kernel_arg_info{ kernel_arg_info }
+ : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info}
{
}
#else // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -107,9 +106,7 @@ public:
GpuWorkloadArgument(const ITensorInfo &tensor_info,
const MemoryDescriptor &mem_desc,
const GpuKernelArgumentList &kernel_args)
- : _tensor_info{ tensor_info },
- _mem_desc{ mem_desc },
- _kernel_args{ kernel_args }
+ : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args}
{
}
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -175,9 +172,9 @@ private:
TensorInfo _tensor_info{};
MemoryDescriptor _mem_desc{};
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- GpuKernelArgumentInfo _kernel_arg_info {};
+ GpuKernelArgumentInfo _kernel_arg_info{};
#else // ACL_INTERNAL_TEST_CKW_IN_DF
- GpuKernelArgumentList _kernel_args {};
+ GpuKernelArgumentList _kernel_args{};
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
@@ -190,7 +187,7 @@ struct UnitWorkloadStage
Prepare, /**< Only run once at the beginning. */
Run, /**< Run every time after the first time. */
};
- Stage stage{ Stage::Run };
+ Stage stage{Stage::Run};
};
inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
@@ -212,7 +209,7 @@ public:
* @param[in] stage Stage of the unit workload
*/
GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
- : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage }
+ : _id{id}, _kernel_code{kernel_code}, _stage{stage}
{
}
/** Get the id of the unit workload */
@@ -253,7 +250,10 @@ public:
*
* @return UnitWorkloadId Allocated unit workload id
*/
- UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context)
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code,
+ const UnitWorkloadStage &stage,
+ const MemoryDescriptorMap &mem_map,
+ const GpuWorkloadContext *context)
{
// Use the size of the kernel codes as Id
const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
@@ -262,12 +262,13 @@ public:
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
ARM_COMPUTE_UNUSED(context);
// Assemble kernel argument with memory descriptor to form workload argument
- for(const auto &id_arg : kernel_code.arguments())
+ for (const auto &id_arg : kernel_code.arguments())
{
- const auto arg_id = id_arg.first;
- const auto arg = id_arg.second;
- _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() };
- if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
+ const auto arg_id = id_arg.first;
+ const auto arg = id_arg.second;
+ _workload_arguments[arg_id] =
+ GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()};
+ if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
{
_tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
}
@@ -276,18 +277,19 @@ public:
#else // ACL_INTERNAL_TEST_CKW_IN_DF
GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
GpuKernelArgumentList tensor_kargs{};
- while(true)
+ while (true)
{
tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args);
- if(tensor_kargs.empty())
+ if (tensor_kargs.empty())
{
break;
}
else
{
const auto tensor_id = tensor_kargs.at(0).id();
- _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs };
- if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
+ _workload_arguments[tensor_id] = GpuWorkloadArgument{
+ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs};
+ if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
{
_tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>();
}
@@ -308,7 +310,7 @@ public:
{
std::vector<UnitWorkloadId> ids{};
- for(const auto &uwk : _unit_workloads)
+ for (const auto &uwk : _unit_workloads)
{
ids.push_back(uwk.id());
}
@@ -323,7 +325,7 @@ public:
std::vector<ITensorInfo::Id> tensors() const
{
std::vector<ITensorInfo::Id> ids{};
- for(const auto &id_tensor : _workload_arguments)
+ for (const auto &id_tensor : _workload_arguments)
{
ids.push_back(id_tensor.first);
}
@@ -337,7 +339,7 @@ public:
}
private:
- std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::vector<GpuUnitWorkload> _unit_workloads{};
std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
};
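
For context on the logic being re-wrapped above: extract_kernel_args_for_one_tensor() pops the leading run of kernel arguments that share a tensor id off a flat list, and add_unit_workload() repeats that until the list is empty. A hedged, stand-alone sketch of the same grouping step, with KernelArg as a simplified stand-in for GpuKernelArgumentBinding:

#include <cstdint>
#include <iostream>
#include <list>

// Simplified stand-in for GpuKernelArgumentBinding: just the owning tensor id
// plus an opaque payload index.
struct KernelArg
{
    int32_t tensor_id;
    int32_t payload;
};

using ArgList = std::list<KernelArg>;

// Pop the leading run of arguments that all belong to the same tensor:
// consume the head, then keep consuming while the id matches.
ArgList extract_args_for_one_tensor(ArgList &flat_args)
{
    if (flat_args.empty())
    {
        return {};
    }

    ArgList         tensor_args{};
    const KernelArg head = flat_args.front();
    tensor_args.push_back(head);
    flat_args.pop_front();

    const int32_t tensor_id = head.tensor_id;
    while (!flat_args.empty())
    {
        const KernelArg &arg = flat_args.front();
        if (arg.tensor_id != tensor_id) // Next tensor starts here.
        {
            break;
        }
        tensor_args.push_back(arg);
        flat_args.pop_front();
    }
    return tensor_args;
}

int main()
{
    ArgList flat{{3, 0}, {3, 1}, {7, 0}, {7, 1}, {7, 2}};
    while (!flat.empty())
    {
        const ArgList group = extract_args_for_one_tensor(flat);
        std::cout << "tensor " << group.front().tensor_id << " has " << group.size() << " argument(s)\n";
    }
    return 0;
}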
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
index 1d8b231efd..ad474674f9 100644
--- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Window.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
index 4b4c22fa1d..c4ab110c92 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -23,6 +23,7 @@
*/
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
#include "ckw/Error.h"
namespace arm_compute
@@ -36,12 +37,12 @@ GpuCkwComponentArgument::GpuCkwComponentArgument()
{
}
-GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor)
- : _tensor(&tensor)
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
{
}
-GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler)
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
+ const ckw::TensorTileSampler &tile_sampler)
{
CKW_ASSERT(_tile == nullptr);
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
index 80f91389a0..863989a7bd 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -110,9 +110,9 @@ public:
const ckw::TensorTileSampler &tile_sampler() const;
private:
- ckw::TensorOperand *_tensor{ nullptr };
- ckw::TileOperand *_tile{ nullptr };
- ckw::TensorTileSampler _tile_sampler{};
+ ckw::TensorOperand *_tensor{nullptr};
+ ckw::TileOperand *_tile{nullptr};
+ ckw::TensorTileSampler _tile_sampler{};
};
} // namespace dynamic_fusion
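
GpuCkwComponentArgument starts out holding only a tensor operand; a tile (and its sampler) is attached lazily the first time a component loads from it, which is what the writer's op_load_once() in the next file checks. A compact sketch of that load-once pattern, using std::optional<std::string> as a stand-in for the tile operand:

#include <iostream>
#include <optional>
#include <string>
#include <utility>

// Lazy "load once" pattern: a tensor variable gains a tile the first time any
// component needs it; later components reuse the same tile instead of
// re-loading it from global memory.
class ComponentArgument
{
public:
    explicit ComponentArgument(std::string tensor_name) : _tensor(std::move(tensor_name))
    {
    }

    bool has_tile() const { return _tile.has_value(); }

    // Called by the writer on first use only.
    void load_once()
    {
        if (!has_tile())
        {
            _tile = _tensor + "_tile"; // Stand-in for declare_tile() + op_load().
            std::cout << "loaded " << *_tile << "\n";
        }
    }

    const std::string &tile() const { return *_tile; }

private:
    std::string                _tensor;
    std::optional<std::string> _tile{}; // Empty until the first load.
};

int main()
{
    ComponentArgument src("src");
    src.load_once(); // Performs the load.
    src.load_once(); // No-op: tile already present.
    std::cout << src.tile() << "\n";
    return 0;
}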
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
index a24a172d77..c927f32bde 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -23,17 +23,16 @@
*/
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/utils/Log.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
using namespace ckw;
namespace arm_compute
@@ -43,11 +42,11 @@ namespace experimental
namespace dynamic_fusion
{
GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
- : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }, _code{}
+ : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{}
{
// Generate kernel name
std::string name = "";
- for(auto &comp : _components)
+ for (auto &comp : _components)
{
auto ckw_driver = comp->ckw_component_driver();
ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
@@ -60,7 +59,7 @@ GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
GpuCkwScopedKernelWriter writer(&root_writer);
GpuCkwVariableTable vtable{};
- for(auto &comp : _components)
+ for (auto &comp : _components)
{
auto ckw_driver = comp->ckw_component_driver();
ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
@@ -82,7 +81,7 @@ std::string GpuCkwDriver::get_code()
std::string GpuCkwDriver::get_config_id()
{
std::string id = "";
- for(auto &comp : _components)
+ for (auto &comp : _components)
{
auto ckw_driver = comp->ckw_component_driver();
ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
@@ -101,9 +100,9 @@ Window GpuCkwDriver::get_window() const
GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
{
GpuKernelArgumentList args{};
- for(const auto &arg : _kernel.arguments())
+ for (const auto &arg : _kernel.arguments())
{
- switch(arg.type())
+ switch (arg.type())
{
case KernelArgument::Type::TensorStorage:
{
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index 19db575fea..2ca5fb435c 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -24,12 +24,12 @@
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
+#include "ckw/Kernel.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
-#include "ckw/Kernel.h"
-
#include <map>
#include <string>
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
index ca4f121566..5f8ce919e3 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
@@ -23,10 +23,12 @@
*/
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
#include "ckw/Error.h"
#include "ckw/TileInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
namespace arm_compute
{
namespace experimental
@@ -34,21 +36,21 @@ namespace experimental
namespace dynamic_fusion
{
-GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel)
- : KernelWriter(kernel)
+GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
{
}
void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
{
- if(!tensor_or_tile->has_tile())
+ if (!tensor_or_tile->has_tile())
{
CKW_ASSERT(tensor_or_tile->has_tensor());
auto &tensor = tensor_or_tile->tensor();
const auto tile_name = tensor.name() + "_tile";
- auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
+ auto &tile =
+ declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
op_load(tile, tensor, sampler);
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
index 043fda9e6f..cbadbd9639 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -23,6 +23,7 @@
*/
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
index 4d11b5e3e4..81049bfe37 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -63,7 +63,7 @@ public:
private:
GpuCkwKernelWriter *_writer;
- int32_t _parent_id_space;
+ int32_t _parent_id_space;
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
index 37c27cd116..88a0cf7f43 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -23,11 +23,12 @@
*/
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
#include <sstream>
namespace arm_compute
@@ -36,19 +37,22 @@ namespace experimental
{
namespace dynamic_fusion
{
-GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
- const std::string &alias)
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ TensorStorageType storage,
+ const std::string &alias)
{
ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
// Do not re-declare if the variable associated with the tensor has already been declared
auto it = _vars.find(tensor->id());
- if(it != _vars.end())
+ if (it != _vars.end())
{
return &it->second;
}
- if(comp_group.is_intermediate_tensor(tensor))
+ if (comp_group.is_intermediate_tensor(tensor))
{
// Create a virtual tensor variable
GpuCkwComponentArgument var;
@@ -61,7 +65,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo
std::stringstream ss;
ss << alias << "_t" << abs(tensor->id());
const auto uniq_name = ss.str();
- GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) };
+ GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))};
auto &&inserted = _vars.emplace(tensor->id(), var);
return &(inserted.first->second);
}
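
The declare_variable() reformatted above follows a declare-once cache: if a variable for the tensor id already exists it is returned, otherwise a uniquely named one (alias + "_t" + |id|) is emplaced. A small sketch of that cache, with a plain std::string standing in for GpuCkwComponentArgument:

#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

// Declare-once cache keyed by tensor id. The mapped value here is just the
// generated variable name; in the real table it is a component argument.
class VariableTable
{
public:
    const std::string &declare_variable(int tensor_id, const std::string &alias)
    {
        // Do not re-declare if this tensor already has a variable.
        auto it = _vars.find(tensor_id);
        if (it != _vars.end())
        {
            return it->second;
        }

        // Build a unique name, e.g. "src_t3", and cache it.
        std::stringstream ss;
        ss << alias << "_t" << std::abs(tensor_id);
        auto inserted = _vars.emplace(tensor_id, ss.str());
        return inserted.first->second;
    }

private:
    std::map<int, std::string> _vars{};
};

int main()
{
    VariableTable vtable;
    std::cout << vtable.declare_variable(3, "src") << "\n";  // declares src_t3
    std::cout << vtable.declare_variable(3, "src") << "\n";  // cached, same name
    std::cout << vtable.declare_variable(-4, "dst") << "\n"; // declares dst_t4
    return 0;
}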
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
index 0649dcba9d..2b118911b8 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -25,6 +25,7 @@
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
#include "arm_compute/core/ITensorInfo.h"
+
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
#include <map>
@@ -58,8 +59,11 @@ public:
*
* @return GpuCkwComponentArgument*
*/
- GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
- const std::string &alias = "unnamed");
+ GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ TensorStorageType storage,
+ const std::string &alias = "unnamed");
private:
std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
index 14086f785e..52e56e2e35 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
@@ -25,6 +25,7 @@
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
#include "arm_compute/core/Window.h"
+
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
@@ -73,8 +74,7 @@ public:
* @param[in] id Component id
* @param[in] tensors Tensor arguments to the components
*/
- IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
- : _id{ id }, _tensors{ tensors }
+ IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
{
}
/** Destructor */
@@ -89,7 +89,9 @@ public:
*
* @note @p writer can only be passed via value since the new scope is created in the copy constructor
*/
- virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0;
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const = 0;
/** Get tensor arguments */
ArgumentPack<ITensorInfo> tensors() const
{
@@ -128,7 +130,7 @@ public:
}
private:
- ComponentId _id{ -1 };
+ ComponentId _id{-1};
ArgumentPack<ITensorInfo> _tensors{};
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
index c07fac0e0d..c3b1b3c8bc 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -24,16 +24,18 @@
#include "GpuCkwActivation.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
#include "ckw/TensorTileSampler.h"
+
#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
#include <string>
using namespace ckw;
@@ -87,24 +89,25 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_
GpuCkwActivation::GpuCkwActivation(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuCkwComponentDriver{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes }
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
}
-void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
const unsigned int n0 = root_window.x().step();
const unsigned int m0 = root_window.y().step();
- GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *src =
+ vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler);
@@ -119,7 +122,7 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp
const auto &constant_B = writer->declare_tile("B_VAL", _attributes.b());
// Perform the operation.
- switch(_attributes.activation())
+ switch (_attributes.activation())
{
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
{
@@ -179,9 +182,10 @@ Window GpuCkwActivation::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- constexpr unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+ constexpr unsigned int vector_size_byte_opencl = 16;
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
return win;
}
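
GpuCkwActivation::get_window() (and the Cast and ElementwiseBinary counterparts further down) derive the per-iteration step from a 16-byte OpenCL vector divided by the destination element size, clamped by adjust_vec_size(). The sketch below approximates that calculation; adjust_vec_size_approx() is a simplified stand-in, not the library's exact implementation.

#include <cstddef>
#include <iostream>

// Simplified approximation of adjust_vec_size(): shrink the requested OpenCL
// vector width until it does not exceed the innermost tensor dimension.
unsigned int adjust_vec_size_approx(unsigned int vec_size, size_t dim0)
{
    while (vec_size > 1 && vec_size > dim0)
    {
        vec_size /= 2;
    }
    return vec_size;
}

// Elements per iteration as used by the components above: start from a
// 16-byte OpenCL vector and divide by the element size of the destination.
unsigned int elems_per_iteration(size_t element_size, size_t dst_dim0)
{
    constexpr unsigned int vector_size_byte_opencl = 16;
    return adjust_vec_size_approx(vector_size_byte_opencl / element_size, dst_dim0);
}

int main()
{
    // F32 tensor (4-byte elements), innermost dimension 10: 16/4 = 4 -> 4.
    std::cout << elems_per_iteration(4, 10) << "\n";
    // F16 tensor (2-byte elements), innermost dimension 3:  16/2 = 8 -> shrink to 2.
    std::cout << elems_per_iteration(2, 3) << "\n";
    return 0;
}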
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
index e157e36cbf..386e933a72 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
@@ -46,15 +46,15 @@ public:
* @param[in] tensors Tensor arguments to the component
* @param[in] attributes Component attributes
*/
- GpuCkwActivation(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation);
/** Destructor */
~GpuCkwActivation() override = default;
// Inherited methods overriden:
- virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
- Window get_window() const override;
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
private:
const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
index 6ecf2bac44..e8e5087633 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -24,16 +24,18 @@
#include "GpuCkwCast.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
#include "ckw/TensorTileSampler.h"
+
#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
#include <string>
using namespace ckw;
@@ -84,30 +86,29 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_
}
} // namespace
-GpuCkwCast::GpuCkwCast(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuCkwComponentDriver{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes }
+GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
}
-void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwCast::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
const unsigned int n0 = root_window.x().step();
const unsigned int m0 = root_window.y().step();
- GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *src =
+ vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
// Load the source tile and prepare the sampler.
- if(!src->has_tile())
+ if (!src->has_tile())
{
const auto sampler = create_sampler(writer, m0, n0);
writer->op_load_once(src, sampler);
@@ -122,7 +123,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa
const auto &sampler = src->tile_sampler();
// Prepare the output tile.
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
// Get Target datatype and convert it to ckw::DataType.
ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type());
@@ -143,7 +144,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa
const size_t dst_size = data_size_from_type(_dst->data_type());
const bool cast_down = (src_size >= dst_size);
- if(cast_down && is_data_type_quantized(_src->data_type()))
+ if (cast_down && is_data_type_quantized(_src->data_type()))
{
const auto &constant_x80 = writer->declare_tile("0x80", 0x80);
writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80);
@@ -151,7 +152,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa
ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
- if(cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
+ if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
{
convert_policy = ckw::ConvertPolicy::Saturate;
}
@@ -167,9 +168,10 @@ Window GpuCkwCast::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- constexpr unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+ constexpr unsigned int vector_size_byte_opencl = 16;
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
return win;
}
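
The convert-policy selection reformatted above saturates when casting down from a floating-point source, or when the attributes explicitly request saturation. A self-contained sketch of that decision follows; the DataType subset and helper functions here are illustrative stand-ins for the arm_compute ones.

#include <cstddef>
#include <iostream>

enum class DataType { U8, S8, F16, F32 };    // Illustrative subset only.
enum class ConvertPolicy { None, Saturate }; // Mirrors the two choices above.

size_t data_size_from_type(DataType dt)
{
    switch (dt)
    {
        case DataType::U8:
        case DataType::S8:
            return 1;
        case DataType::F16:
            return 2;
        case DataType::F32:
            return 4;
    }
    return 0;
}

bool is_data_type_float(DataType dt)
{
    return dt == DataType::F16 || dt == DataType::F32;
}

// Same shape as the logic in GpuCkwCast::write_component_code(): saturate when
// casting down from a float source, or when the caller asked for SATURATE.
ConvertPolicy choose_policy(DataType src, DataType dst, bool user_wants_saturate)
{
    const bool cast_down = data_size_from_type(src) >= data_size_from_type(dst);
    if (cast_down && (is_data_type_float(src) || user_wants_saturate))
    {
        return ConvertPolicy::Saturate;
    }
    return ConvertPolicy::None;
}

int main()
{
    std::cout << (choose_policy(DataType::F32, DataType::U8, false) == ConvertPolicy::Saturate) << "\n"; // 1
    std::cout << (choose_policy(DataType::U8, DataType::F32, false) == ConvertPolicy::Saturate) << "\n"; // 0
    return 0;
}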
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
index 821cec1e19..2389301196 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
@@ -46,15 +46,15 @@ public:
* @param[in] tensors Tensor arguments to the component
* @param[in] attributes Component attributes
*/
- GpuCkwCast(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast);
/** Destructor */
~GpuCkwCast() override = default;
// Inherited methods overriden:
- virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
- Window get_window() const override;
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
private:
const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
index 3c906646a6..7833da2334 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -25,21 +25,20 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
-
+#include "arm_compute/core/Validate.h"
#include "ckw/TensorTileSampler.h"
#include "ckw/TileInfo.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
namespace arm_compute
{
@@ -54,13 +53,7 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuCkwComponentDriver{ id, tensors },
- _src{},
- _wei{},
- _bia{},
- _dst{},
- _attributes{ attributes },
- _settings{ settings }
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -69,7 +62,9 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
}
-void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const auto desc = _settings.direct_conv_descriptor();
ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
@@ -99,15 +94,18 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
// extra loop to compute the left-over elements.
const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
- GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *src =
+ vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
GpuCkwComponentArgument *wei = vtable.declare_variable(
- comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ comp_group, writer, _wei,
+ use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
GpuCkwComponentArgument *bia = nullptr;
const bool using_bias = _bia != nullptr;
- if(using_bias)
+ if (using_bias)
{
bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
}
@@ -154,7 +152,8 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
// We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
// indirection buffer mi does not contain negative values representing out-of-bounds reads.
- src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly);
+ src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None
+ : TensorSamplerAddressModeY::SkipMinEdgeOnly);
src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
TensorTileSampler wei_sampler;
@@ -178,7 +177,7 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
dst_sampler.z(tile_0);
dst_sampler.b(tile_bout);
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
dst->init_virtual_tensor(tile, dst_sampler);
@@ -189,10 +188,10 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
// We create a 2d container of size (M0, 1) to store the indices for iteration
TileContainer it;
- for(int m = 0; m < m0; ++m)
+ for (int m = 0; m < m0; ++m)
{
- std::vector<std::string> idx { std::to_string(m) };
- it.push_back({ idx });
+ std::vector<std::string> idx{std::to_string(m)};
+ it.push_back({idx});
}
const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
@@ -289,9 +288,9 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
// Bias addition
// NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
// performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
- if(using_bias)
+ if (using_bias)
{
- if(!bia->has_tile())
+ if (!bia->has_tile())
{
// Reuse the destination sampler for the bias
writer->op_load_once(bia, dst_sampler);
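
One condition reformatted in this file decides whether the convolution weights may be exported to an OpenCL image: the descriptor must request it, k0 must be exactly 4 and the reduction dimension K must be a multiple of 4. A tiny stand-alone sketch of that choice (TensorStorage is a simplified stand-in enum, not the library's type):

#include <iostream>

enum class TensorStorage { BufferUint8Ptr, Image2dReadOnly };

// Mirrors the use_cl_image_for_weights condition above.
TensorStorage pick_weights_storage(bool export_to_cl_image, int k0, int K)
{
    const bool use_cl_image = export_to_cl_image && (k0 == 4) && (K % 4 == 0);
    return use_cl_image ? TensorStorage::Image2dReadOnly : TensorStorage::BufferUint8Ptr;
}

int main()
{
    std::cout << (pick_weights_storage(true, 4, 64) == TensorStorage::Image2dReadOnly) << "\n"; // 1
    std::cout << (pick_weights_storage(true, 2, 64) == TensorStorage::Image2dReadOnly) << "\n"; // 0
    return 0;
}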
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
index c8bf999261..2935ba45ea 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -24,22 +24,24 @@
#include "GpuCkwElementwiseBinary.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "ckw/TensorTileSampler.h"
#include "ckw/types/TensorSamplerTypes.h"
+
#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
+
#include <algorithm>
#include <string>
@@ -53,11 +55,7 @@ namespace dynamic_fusion
GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuCkwComponentDriver{ id, tensors },
- _lhs{},
- _rhs{},
- _dst{},
- _attributes{ attributes }
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
{
_lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -65,15 +63,20 @@ GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId
ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
}
-void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
const auto n0 = static_cast<int32_t>(root_window.x().step());
const auto m0 = static_cast<int32_t>(root_window.y().step());
- GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
- GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *lhs =
+ vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
+ GpuCkwComponentArgument *rhs =
+ vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
@@ -86,32 +89,36 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_gr
auto &const_0 = writer->declare_tile("0", 0);
// Load the LHS and RHS tiles
- if(!lhs->has_tile())
+ if (!lhs->has_tile())
{
- auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), n0, m0, "lhs_", const_0);
+ auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1),
+ n0, m0, "lhs_", const_0);
sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
sampler.z(const_0);
sampler.b(gid_2);
writer->op_load_once(lhs, sampler);
}
- if(!rhs->has_tile())
+ if (!rhs->has_tile())
{
- auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), n0, m0, "rhs_", const_0);
+ auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1),
+ n0, m0, "rhs_", const_0);
sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
sampler.z(const_0);
sampler.b(gid_2);
writer->op_load_once(rhs, sampler);
}
- auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), n0, m0, "dst_", const_0);
+ auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1),
+ n0, m0, "dst_", const_0);
dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
dst_sampler.z(const_0);
dst_sampler.b(gid_2);
// Prepare the output tile.
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
- auto &tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width()));
+ auto &tile = writer->declare_tile(
+ "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width()));
dst->init_virtual_tensor(tile, dst_sampler);
}
@@ -131,9 +138,10 @@ Window GpuCkwElementwiseBinary::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- constexpr unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+ constexpr unsigned int vector_size_byte_opencl = 16;
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
return win;
}
@@ -141,11 +149,12 @@ Window GpuCkwElementwiseBinary::get_window() const
std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const
{
ARM_COMPUTE_UNUSED(comp_group);
- const std::vector<std::string> build_params =
- {
+ const std::vector<std::string> build_params = {
"elementwise_binary",
- "op", to_string(_attributes.operation()),
- "dt", lower_string(string_from_data_type(_dst->data_type())),
+ "op",
+ to_string(_attributes.operation()),
+ "dt",
+ lower_string(string_from_data_type(_dst->data_type())),
};
return join(build_params, "_");
}
@@ -154,13 +163,16 @@ std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_gro
{
ARM_COMPUTE_UNUSED(comp_group);
/// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles)
- std::vector<std::string> build_params =
- {
+ std::vector<std::string> build_params = {
"elementwise_binary",
- "op", to_string(_attributes.operation()),
- "dt", lower_string(string_from_data_type(_dst->data_type())),
- "dst_dim0", support::cpp11::to_string(_dst->dimension(0)),
- "dst_dim1", support::cpp11::to_string(_dst->dimension(1)),
+ "op",
+ to_string(_attributes.operation()),
+ "dt",
+ lower_string(string_from_data_type(_dst->data_type())),
+ "dst_dim0",
+ support::cpp11::to_string(_dst->dimension(0)),
+ "dst_dim1",
+ support::cpp11::to_string(_dst->dimension(1)),
};
return join(build_params, "_");
}
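
get_name() and get_tuner_id() above assemble their identifiers by joining name fragments with underscores. A minimal version of that step, with the join helper written out locally instead of taken from the library's string utilities:

#include <iostream>
#include <string>
#include <vector>

// Join name fragments with a separator, as used for kernel and tuner ids.
std::string join(const std::vector<std::string> &parts, const std::string &sep)
{
    std::string result;
    for (size_t i = 0; i < parts.size(); ++i)
    {
        if (i != 0)
        {
            result += sep;
        }
        result += parts[i];
    }
    return result;
}

int main()
{
    const std::vector<std::string> build_params = {
        "elementwise_binary",
        "op",
        "add",
        "dt",
        "f32",
    };
    // Prints: elementwise_binary_op_add_dt_f32
    std::cout << join(build_params, "_") << "\n";
    return 0;
}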
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
index e9c41530f8..1a20d4c533 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -46,17 +46,17 @@ public:
* @param[in] tensors Tensor arguments to the component
* @param[in] attributes Component attributes
*/
- GpuCkwElementwiseBinary(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary);
/** Destructor */
~GpuCkwElementwiseBinary() override = default;
// Inherited methods overriden:
- virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
- Window get_window() const override;
- std::string get_name(const ComponentGroup &comp_group) const override;
- std::string get_tuner_id(const ComponentGroup &comp_group) const override;
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+ std::string get_tuner_id(const ComponentGroup &comp_group) const override;
private:
const ITensorInfo *_lhs;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
index 9c9a298132..8ab3ec3a55 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -24,17 +24,18 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
#include "ckw/TensorTileSampler.h"
+
#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
using namespace ckw;
@@ -48,11 +49,7 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuCkwComponentDriver{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes },
- _settings{ settings }
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
@@ -60,14 +57,18 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id,
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
}
-void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
const unsigned int n0 = root_window.x().step();
const unsigned int m0 = root_window.y().step();
- GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *src =
+ vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
@@ -90,23 +91,26 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw
const auto src_data_type = _src->data_type();
// Check if this is global pooling path
- const bool is_global_pooling = (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0);
+ const bool is_global_pooling =
+ (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0);
// Check if this a case of FP_MIXED_PRECISION
- const bool use_fp_mixed_precision = (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
- const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type);
+ const bool use_fp_mixed_precision =
+ (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
+ const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type);
TileOperand &const_0 = writer->declare_tile("0", 0);
const TileOperand &const_1 = writer->declare_tile("1", 1);
const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits<float>::lowest());
const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x);
const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y);
- const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
- const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
- const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x);
- const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y);
- const TileOperand &dst_height_tile = writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
- const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height);
- const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width);
+ const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
+ const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
+ const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x);
+ const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y);
+ const TileOperand &dst_height_tile =
+ writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
+ const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height);
+ const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width);
TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32);
TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32);
@@ -145,7 +149,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw
// Prepare dst tensor and tile
TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0);
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info);
dst->init_virtual_tensor(dst_tile, dst_sampler);
@@ -156,14 +160,15 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw
const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
// Initialise result tile with appropriate value
- if(_attributes.pool_type() == PoolingType::MAX)
+ if (_attributes.pool_type() == PoolingType::MAX)
{
- if(_settings.use_inf_as_limit())
+ if (_settings.use_inf_as_limit())
{
TileContainer minus_inf_tile_container;
std::vector<std::string> value = std::vector<std::string>(n0, "(-INFINITY)");
- minus_inf_tile_container.push_back({ value });
- const TileOperand &minus_inf = writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type));
+ minus_inf_tile_container.push_back({value});
+ const TileOperand &minus_inf =
+ writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type));
writer->op_assign(res_tile, minus_inf);
}
else
@@ -209,7 +214,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw
writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e);
const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
- if(_attributes.exclude_padding())
+ if (_attributes.exclude_padding())
{
const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
@@ -227,7 +232,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw
const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32);
const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32);
- if(is_global_pooling)
+ if (is_global_pooling)
{
writer->op_assign(x, const_0);
writer->op_assign(y, const_0);
@@ -242,76 +247,80 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw
}
// Y dim for-loop
- writer->op_for_loop(y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, [&]()
- {
- // Reset the iterator for the inner loop
- if(is_global_pooling)
- {
- writer->op_assign(x, const_0);
- }
- else
+ writer->op_for_loop(
+ y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1,
+ [&]()
{
- writer->op_assign(x, pool_x_s);
- }
-
- TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
- writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y);
-
- // X dim for-loop
- writer->op_for_loop(x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, [&]()
- {
- TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
- writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x);
-
- TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
-
- src_sampler.y(a_x);
- src_sampler.z(a_y);
-
- // Load src tile
- if(use_fp_mixed_precision)
+ // Reset the iterator for the inner loop
+ if (is_global_pooling)
{
- TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info);
- writer->op_load(src_uncasted_tile, src->tensor(), src_sampler);
- writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None);
+ writer->op_assign(x, const_0);
}
else
{
- writer->op_load(src_tile, src->tensor(), src_sampler);
+ writer->op_assign(x, pool_x_s);
}
- // Take the square of the input, for L2 Pooling
- if(_attributes.pool_type() == PoolingType::L2)
- {
- writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile);
- }
-
-            // Perform Pooling op
- if(_attributes.pool_type() == PoolingType::MAX)
- {
- writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile);
- }
- else
- {
- writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile);
- }
+ TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
+ writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y);
+
+ // X dim for-loop
+ writer->op_for_loop(
+ x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1,
+ [&]()
+ {
+ TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
+ writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x);
+
+ TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
+
+ src_sampler.y(a_x);
+ src_sampler.z(a_y);
+
+ // Load src tile
+ if (use_fp_mixed_precision)
+ {
+ TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info);
+ writer->op_load(src_uncasted_tile, src->tensor(), src_sampler);
+ writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_load(src_tile, src->tensor(), src_sampler);
+ }
+
+ // Take the square of the input, for L2 Pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile);
+ }
+
+                    // Perform Pooling op
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile);
+ }
+ else
+ {
+ writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile);
+ }
+ });
});
- });
- if((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
+ if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
{
// filter_size is automatically broadcasted in the operation
writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size);
}
// Take square root of the result in L2 pooling
- if(_attributes.pool_type() == PoolingType::L2)
+ if (_attributes.pool_type() == PoolingType::L2)
{
writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile);
}
// Store the results and do casting if FP_MIXED_PRECISION
- if(use_fp_mixed_precision)
+ if (use_fp_mixed_precision)
{
writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None);
}
@@ -326,7 +335,7 @@ Window GpuCkwPool2d::get_window() const
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
TensorShape output_shape = _dst->tensor_shape();
- const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
// Create and configure kernel window
auto win = calculate_max_window(output_shape, Steps(vec_size));
win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
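
For reference, the pooling arithmetic that the GpuCkwPool2d writer emits above (window clamping, -INFINITY initialisation for MAX, squaring for L2, division by the valid filter size when padding is excluded, and a final square root for L2) can be sketched in plain scalar C++. This is an illustrative sketch only; the names, layout and rounding below are assumptions and are not part of the commit.

// Illustrative scalar reference for the pooling arithmetic emitted above.
// Not part of the commit; row-major H x W layout is an assumption.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

enum class PoolType { MAX, AVG, L2 };

std::vector<float> pool2d_ref(const std::vector<float> &src, int src_h, int src_w,
                              int pool_h, int pool_w, int stride_y, int stride_x,
                              int pad_y, int pad_x, PoolType type, bool exclude_padding)
{
    const int dst_h = (src_h + 2 * pad_y - pool_h) / stride_y + 1;
    const int dst_w = (src_w + 2 * pad_x - pool_w) / stride_x + 1;
    std::vector<float> dst(static_cast<size_t>(dst_h) * dst_w, 0.0f);

    for (int oy = 0; oy < dst_h; ++oy)
    {
        for (int ox = 0; ox < dst_w; ++ox)
        {
            // Window start/end clamped to the valid source region
            // (mirrors pool_x_s / pool_x_e / pool_y_e in the kernel writer).
            const int ys = std::max(oy * stride_y - pad_y, 0);
            const int xs = std::max(ox * stride_x - pad_x, 0);
            const int ye = std::min(oy * stride_y - pad_y + pool_h, src_h);
            const int xe = std::min(ox * stride_x - pad_x + pool_w, src_w);

            float acc = (type == PoolType::MAX) ? -std::numeric_limits<float>::infinity() : 0.0f;
            for (int y = ys; y < ye; ++y)
            {
                for (int x = xs; x < xe; ++x)
                {
                    float v = src[static_cast<size_t>(y) * src_w + x];
                    if (type == PoolType::L2)
                        v *= v; // square the input for L2 pooling
                    acc = (type == PoolType::MAX) ? std::max(acc, v) : acc + v;
                }
            }
            if (type != PoolType::MAX)
            {
                // exclude_padding divides by the valid window area only.
                const int filter_size = exclude_padding ? (ye - ys) * (xe - xs) : pool_h * pool_w;
                acc /= static_cast<float>(filter_size);
            }
            if (type == PoolType::L2)
                acc = std::sqrt(acc); // square root of the accumulated sum for L2
            dst[static_cast<size_t>(oy) * dst_w + ox] = acc;
        }
    }
    return dst;
}
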
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
index 2ccf255236..822282a108 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
@@ -59,9 +59,11 @@ public:
/** Destructor */
~GpuCkwPool2d() override = default;
    // Inherited methods overridden:
- virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
- Window get_window() const override;
- std::string get_name(const ComponentGroup &comp_group) const override;
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
private:
const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
index d997c82dae..f2a7d41afd 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -28,14 +28,13 @@
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/ScaleUtils.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -49,20 +48,17 @@ namespace
constexpr unsigned int opencl_vector_size_in_bytes = 16;
} // namespace
-GpuCkwResize::GpuCkwResize(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuCkwComponentDriver{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes }
+GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
}
-void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
@@ -72,12 +68,16 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
const int32_t m0 = root_window.y().step();
const int32_t partial_n0 = _dst->dimension(0) % n0;
- GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *src =
+ vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
// Constants
- const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners());
- const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
const auto &tile_0 = writer->declare_tile("0", 0);
@@ -112,7 +112,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
- switch(_attributes.sampling_policy())
+ switch (_attributes.sampling_policy())
{
case SamplingPolicy::TOP_LEFT:
// xi_f = (xo * scale_x)
@@ -138,7 +138,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
ARM_COMPUTE_ERROR("Unsupported sampling policy");
}
- if(_attributes.align_corners())
+ if (_attributes.align_corners())
{
writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f);
writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f);
@@ -161,8 +161,10 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
- writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, tile_src_w_minus_1);
- writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, tile_src_h_minus_1);
+ writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0,
+ tile_src_w_minus_1);
+ writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0,
+ tile_src_h_minus_1);
TensorTileSampler src_sampler;
src_sampler.x(tile_co);
@@ -199,7 +201,9 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
writer->op_assign(tile_dst, tile_src);
}
-void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
@@ -209,12 +213,16 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa
const int32_t m0 = root_window.y().step();
const int32_t partial_n0 = _dst->dimension(0) % n0;
- GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *src =
+ vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *dst =
+ vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
// Constants
- const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners());
- const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
const auto &tile_0 = writer->declare_tile("0", 0);
@@ -251,7 +259,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa
const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
- switch(_attributes.sampling_policy())
+ switch (_attributes.sampling_policy())
{
case SamplingPolicy::TOP_LEFT:
// xi_f = (xo * scale_x)
@@ -312,8 +320,10 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa
writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1);
writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1);
- writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, tile_src_w_minus_1);
- writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, tile_src_h_minus_1);
+ writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0,
+ tile_src_w_minus_1);
+ writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0,
+ tile_src_h_minus_1);
TensorTileSampler in_sampler;
in_sampler.x(tile_co);
@@ -388,7 +398,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa
writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float);
writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1);
- if(is_data_type_float(_src->data_type()))
+ if (is_data_type_float(_src->data_type()))
{
// Cast weights to source type
const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
@@ -461,9 +471,11 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa
}
}
-void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwResize::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
- switch(_attributes.interpolation_policy())
+ switch (_attributes.interpolation_policy())
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
do_nearest_neighbor_resize(comp_group, vtable, writer);
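
The nearest-neighbour path above scales each output coordinate by a per-axis resize ratio, optionally rounds when align_corners is set, and clamps the result into [0, src_dim - 1]. A minimal sketch of that index math follows; the align-corners ratio and the CENTER half-pixel offset are assumptions (only the TOP_LEFT case is fully visible in these hunks), and none of these names belong to the library.

// Illustrative index math for the nearest-neighbour path; a sketch only.
// resize_ratio() and the CENTER offset are assumed behaviour, not library code.
#include <algorithm>
#include <cmath>
#include <cstdint>

enum class Sampling { TOP_LEFT, CENTER };

inline float resize_ratio(int32_t src_dim, int32_t dst_dim, bool align_corners)
{
    // Assumed align-corners convention: (src - 1) / (dst - 1), otherwise src / dst.
    if (align_corners && dst_dim > 1)
        return static_cast<float>(src_dim - 1) / static_cast<float>(dst_dim - 1);
    return static_cast<float>(src_dim) / static_cast<float>(dst_dim);
}

inline int32_t nearest_src_index(int32_t xo, int32_t src_dim, float scale,
                                 Sampling policy, bool align_corners)
{
    // TOP_LEFT: xi_f = xo * scale; CENTER adds a half-pixel offset (assumption).
    float xi_f = (policy == Sampling::TOP_LEFT) ? xo * scale : (xo + 0.5f) * scale;
    if (align_corners)
        xi_f = std::round(xi_f);
    // Clamp into [0, src_dim - 1], as the ternary Clamp calls above do.
    return std::clamp(static_cast<int32_t>(xi_f), 0, src_dim - 1);
}
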
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
index 8917391537..889706b0c0 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -24,10 +24,12 @@
#include "GpuCkwStore.h"
#include "arm_compute/core/Error.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
#include <string>
namespace arm_compute
@@ -37,12 +39,14 @@ namespace experimental
namespace dynamic_fusion
{
GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
- : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{}
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
}
-void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwStore::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
{
auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
index 8e35651caf..f1f0e6747b 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -48,8 +48,10 @@ public:
/** Destructor */
~GpuCkwStore() override = default;
    // Inherited methods overridden:
- virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
- std::string get_name(const ComponentGroup &comp_group) const override;
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
private:
const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
index e2b8584b99..6ba2b2f651 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/misc/Utility.h"
#include "ckw/TensorTileSampler.h"
+
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
@@ -44,9 +45,14 @@ using SamplerCreator = std::function<TensorTileSampler(GpuCkwScopedKernelWriter
/** Load src and dst tiles of dimension [m0, n0] only when not loaded and prepare the sampler
*/
-inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer, GpuCkwComponentArgument *src, GpuCkwComponentArgument *dst, int32_t m0, int32_t n0, SamplerCreator create_sampler)
+inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer,
+ GpuCkwComponentArgument *src,
+ GpuCkwComponentArgument *dst,
+ int32_t m0,
+ int32_t n0,
+ SamplerCreator create_sampler)
{
- if(!src->has_tile())
+ if (!src->has_tile())
{
const auto sampler = create_sampler(writer, m0, n0);
writer->op_load_once(src, sampler);
@@ -61,7 +67,7 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri
const auto &sampler = src->tile_sampler();
// Prepare the output tile.
- if(!dst->has_tile())
+ if (!dst->has_tile())
{
auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
dst->init_virtual_tensor(tile, sampler);
@@ -78,7 +84,13 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri
* @param[in] prefix Prefix to all the tiles declared within this function
* @param[in] const_0 Constant tile of value 0
*/
-inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const TileOperand &gid, int32_t step_v, int32_t leftover_step_v, const std::string &prefix, const TileOperand &const_0)
+inline void get_coord(GpuCkwScopedKernelWriter writer,
+ TileOperand &coord,
+ const TileOperand &gid,
+ int32_t step_v,
+ int32_t leftover_step_v,
+ const std::string &prefix,
+ const TileOperand &const_0)
{
auto &step = writer->declare_tile(prefix + "step", step_v);
auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v);
@@ -122,8 +134,15 @@ inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const
*
* @return TensorTileSampler
*/
-inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, TileOperand &gid_0, TileOperand &gid_1, int32_t dim0_v, int32_t dim1_v, int32_t n0_v, int32_t m0_v,
- const std::string prefix, TileOperand &const_0)
+inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer,
+ TileOperand &gid_0,
+ TileOperand &gid_1,
+ int32_t dim0_v,
+ int32_t dim1_v,
+ int32_t n0_v,
+ int32_t m0_v,
+ const std::string prefix,
+ TileOperand &const_0)
{
// Clamp tile size [n0, m0] against dimension [dim0, dim1]
// This is needed to:
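
get_coord() and create_boundary_aware_2d_sampler() above compute per-thread start coordinates that stay inside the tensor even for the leftover (partial) tile at the boundary; their bodies are not fully shown in these hunks, so the following is an assumed realisation of that idea rather than a transcription.

// Assumed realisation of a boundary-aware start coordinate (body not shown above).
#include <algorithm>
#include <cstdint>

// Map a work-item id to the start of a tile of `step` elements inside a
// dimension of size `dim`; the last tile is shifted back so it never runs
// past the end, and the shift is zero when dim is a multiple of step.
inline int32_t tile_start_coord(int32_t gid, int32_t step, int32_t dim)
{
    const int32_t coord = gid * step;
    return (coord + step > dim) ? std::max(dim - step, 0) : coord;
}
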
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
index 34b1283add..5da317bf38 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "ckw/TensorInfo.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
namespace arm_compute
@@ -38,7 +39,7 @@ namespace dynamic_fusion
{
inline ckw::DataType to_ckw(DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::F32:
return ckw::DataType::Fp32;
@@ -65,21 +66,16 @@ inline ckw::DataType to_ckw(DataType dt)
inline ckw::TensorShape to_ckw(const TensorShape &shape)
{
- ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape> {});
- ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape> {} != 5);
+ ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+ ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
/// NOTE: Overflow danger. Use size_t?
- return ckw::TensorShape
- {
- static_cast<int32_t>(shape[0]),
- static_cast<int32_t>(shape[1]),
- static_cast<int32_t>(shape[2]),
- static_cast<int32_t>(shape[3]),
- static_cast<int32_t>(shape[4])
- };
+ return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+ static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+ static_cast<int32_t>(shape[4])};
}
inline ckw::TensorDataLayout to_ckw(DataLayout dl)
{
- switch(dl)
+ switch (dl)
{
case DataLayout::NHWC:
return ckw::TensorDataLayout::Nhwc;
@@ -91,18 +87,13 @@ inline ckw::TensorDataLayout to_ckw(DataLayout dl)
}
inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
{
- return ckw::TensorInfo
- {
- to_ckw(tensor_info.data_type()),
- to_ckw(tensor_info.tensor_shape()),
- to_ckw(tensor_info.data_layout()),
- tensor_info.id()
- };
+ return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+ to_ckw(tensor_info.data_layout()), tensor_info.id()};
}
inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
{
- switch(component)
+ switch (component)
{
case ckw::TensorComponentType::OffsetFirstElement:
return TensorComponentType::OffsetFirstElement;
@@ -142,7 +133,7 @@ inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
{
- switch(storage)
+ switch (storage)
{
case TensorStorageType::ClBufferUint8Ptr:
return ckw::TensorStorageType::BufferUint8Ptr;
@@ -159,7 +150,7 @@ inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
}
inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
{
- switch(storage)
+ switch (storage)
{
case ckw::TensorStorageType::BufferUint8Ptr:
return TensorStorageType::ClBufferUint8Ptr;
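
The converters in Common.h are thin switch-based mappings between arm_compute and ckw enumerations. A generic illustration of that pattern, using hypothetical stand-in enums rather than the library's types:

// Generic illustration of the switch-based type-converter pattern used above.
// LibDataType / KwDataType are hypothetical stand-ins, not arm_compute or ckw types.
#include <stdexcept>

enum class LibDataType { F32, F16, S32 };
enum class KwDataType  { Fp32, Fp16, Int32 };

inline KwDataType to_kw(LibDataType dt)
{
    switch (dt)
    {
        case LibDataType::F32:
            return KwDataType::Fp32;
        case LibDataType::F16:
            return KwDataType::Fp16;
        case LibDataType::S32:
            return KwDataType::Int32;
        default:
            throw std::runtime_error("Unsupported data type"); // the real code uses ARM_COMPUTE_ERROR
    }
}
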
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
index 9cb022fc10..0cba258940 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -25,6 +25,7 @@
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY
#include "ckw/types/Operators.h"
+
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
namespace arm_compute
@@ -35,7 +36,7 @@ namespace dynamic_fusion
{
inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
{
- switch(attributes.operation())
+ switch (attributes.operation())
{
case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
return ckw::BinaryOp::Add;
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
index f7f0029618..ee109a7e2b 100644
--- a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -24,8 +24,9 @@
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
-#include "Types.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "Types.h"
#include <memory>
namespace arm_compute
@@ -49,13 +50,13 @@ public:
* @return std::unique_ptr<IGpuKernelComponent>
*/
template <typename T, typename... Args>
- std::unique_ptr<IGpuKernelComponent> create(Args &&... args)
+ std::unique_ptr<IGpuKernelComponent> create(Args &&...args)
{
return std::make_unique<T>(_count++, std::forward<Args>(args)...);
}
private:
- ComponentId _count{ 0 };
+ ComponentId _count{0};
};
} // namespace dynamic_fusion
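
The factory's create() above hands every component a monotonically increasing ComponentId followed by perfectly forwarded constructor arguments. A self-contained sketch of the same pattern with illustrative types:

// Standalone sketch of the factory pattern shown above: each created object
// receives an increasing id plus perfectly forwarded arguments.
// Component and Widget are illustrative stand-ins, not library types.
#include <memory>
#include <string>
#include <utility>

struct Component
{
    virtual ~Component() = default;
};

struct Widget : Component
{
    Widget(int id, std::string name) : id(id), name(std::move(name)) {}
    int         id;
    std::string name;
};

class ComponentFactory
{
public:
    template <typename T, typename... Args>
    std::unique_ptr<Component> create(Args &&...args)
    {
        return std::make_unique<T>(_count++, std::forward<Args>(args)...);
    }

private:
    int _count{0};
};

// Usage sketch: ComponentFactory f; auto c = f.create<Widget>(std::string("pool2d"));
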
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
index af766a7ece..4b8eea2f57 100644
--- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -24,11 +24,11 @@
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
-#include "Types.h"
-
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "Types.h"
+
namespace arm_compute
{
namespace experimental
@@ -76,13 +76,8 @@ public:
* @param[in] properties Kernel component properties
* @param[in] tensors Tensor arguments to the components
*/
- IGpuKernelComponent(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors)
- : _id{ id },
- _properties{ properties },
- _tensors{ tensors }
+ IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{id}, _properties{properties}, _tensors{tensors}
{
}
/** Destructor */
@@ -117,7 +112,7 @@ public:
virtual GpuComponentType type() const = 0;
private:
- ComponentId _id{ -1 };
+ ComponentId _id{-1};
Properties _properties{};
ArgumentPack<ITensorInfo> _tensors{};
};
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
index c41257d18c..fdf528a65d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -68,17 +68,11 @@ ClComponentActivation::ClComponentActivation(ComponentId
const IGpuKernelComponent::Properties &properties,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuKernelComponent{ id, properties, tensors },
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<ClTemplateActivation>(id, tensors, attributes)
-}
+ _component_writer{std::make_unique<ClTemplateActivation>(id, tensors, attributes)}
#else //ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<GpuCkwActivation>(id, tensors, attributes)
-}
+ _component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
index 9b090af988..02c854356a 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -25,9 +25,8 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
{
@@ -79,20 +78,17 @@ public:
* |F16 |F16 |
* |F32 |F32 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
/** Constructor
*
* Similar to @ref ClComponentActivation::validate()
*/
- ClComponentActivation(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ ClComponentActivation(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
/** Destructor */
~ClComponentActivation() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
index 635869f817..b1636795a3 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -24,6 +24,7 @@
#include "ClComponentCast.h"
#include "arm_compute/core/Error.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
@@ -38,11 +39,10 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status ClComponentCast::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
+Status ClComponentCast::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
{
ARM_COMPUTE_UNUSED(properties, attributes, settings);
@@ -53,13 +53,15 @@ Status ClComponentCast::validate(
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), "input and target data types should be different");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(),
+ "input and target data types should be different");
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), "dst and target data types should be same");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(),
+ "dst and target data types should be same");
}
return Status{};
@@ -69,17 +71,11 @@ ClComponentCast::ClComponentCast(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuKernelComponent{ id, properties, tensors },
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<ClTemplateCast>(id, tensors, attributes)
-}
+ _component_writer{std::make_unique<ClTemplateCast>(id, tensors, attributes)}
#else //ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<GpuCkwCast>(id, tensors, attributes)
-}
+ _component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)}
#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
ARM_COMPUTE_UNUSED(attributes, settings);
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index 37b8cbb6c9..ed77b1203b 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST
#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
@@ -93,11 +94,10 @@ public:
* |F16 | U8, S8, U16, S16, U32, S32, F32 |
* |F32 | U8, S8, U16, S16, U32, S32, F16 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Constructor
*
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
index 5626093079..d95e0be1f2 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h"
@@ -103,11 +104,10 @@ unsigned int Settings::m0() const
return _m0;
}
-Status ClComponentDepthwiseConv2d::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
+Status ClComponentDepthwiseConv2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
{
ARM_COMPUTE_UNUSED(properties, settings);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -121,7 +121,7 @@ Status ClComponentDepthwiseConv2d::validate(
// Matching data type
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
}
@@ -129,7 +129,7 @@ Status ClComponentDepthwiseConv2d::validate(
// Matching data layout
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
}
@@ -138,7 +138,7 @@ Status ClComponentDepthwiseConv2d::validate(
ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
}
@@ -148,16 +148,17 @@ Status ClComponentDepthwiseConv2d::validate(
const DataLayout data_layout = src->data_layout();
const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier()));
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+ (src->dimension(channel_idx) * attributes.depth_multiplier()));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional");
// dst shape is correct
- const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(),
- attributes.pad().left, attributes.pad().right,
- attributes.pad().top, attributes.pad().bottom,
- attributes.dimension_rounding_type());
- const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() };
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+ const PadStrideInfo pad_stride_info =
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type());
+ const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+ attributes.dilation()};
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
@@ -168,19 +169,22 @@ Status ClComponentDepthwiseConv2d::validate(
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1);
- if(conv_info.depth_multiplier > 1 && settings.n0() > 1)
+ if (conv_info.depth_multiplier > 1 && settings.n0() > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0);
}
// Check export weights to cl image
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) &&
+ (export_to_cl_image(wei) == false),
+ "Weights cannot be exported to cl_image!");
ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0));
- ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier));
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+ (src->dimension(channel_idx) * conv_info.depth_multiplier));
// bia shape is correct
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx],
"Biases size and number of dst feature maps should match");
@@ -198,14 +202,13 @@ Status ClComponentDepthwiseConv2d::validate(
return Status{};
}
-ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
- : IGpuKernelComponent{ id, properties, tensors },
- _component_writer{ std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings) }
+ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings)}
{
}
ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
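
The depthwise validate() above checks that the weights carry src_channels * depth_multiplier channels and that the destination shape matches compute_depthwise_convolution_shape(). A scalar sketch of those shape relations, assuming floor rounding (the real check honours the attributes' DimensionRoundingType):

// Scalar sketch of the shape relations checked above; floor rounding is assumed.
struct DwOutputShape
{
    int w, h, c;
};

inline DwOutputShape depthwise_output_shape(int src_w, int src_h, int src_c,
                                            int kernel_w, int kernel_h, int depth_multiplier,
                                            int stride_x, int stride_y,
                                            int pad_left, int pad_right, int pad_top, int pad_bottom,
                                            int dilation_x = 1, int dilation_y = 1)
{
    const int eff_kw = (kernel_w - 1) * dilation_x + 1; // effective (dilated) kernel width
    const int eff_kh = (kernel_h - 1) * dilation_y + 1; // effective (dilated) kernel height
    DwOutputShape out{};
    out.w = (src_w + pad_left + pad_right - eff_kw) / stride_x + 1;
    out.h = (src_h + pad_top + pad_bottom - eff_kh) / stride_y + 1;
    // Weights must carry src_c * depth_multiplier channels; this is also dst's channel count.
    out.c = src_c * depth_multiplier;
    return out;
}
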
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
index 0e2b5f14cb..b3e1bd222d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -25,7 +25,9 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D
#include "arm_compute/core/Error.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
#include <memory>
namespace arm_compute
@@ -77,12 +79,12 @@ public:
unsigned int m0() const;
private:
- bool _export_input_to_cl_image{ false }; /**< Export input to cl_image */
- bool _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */
- bool _fast_relaxed_math{ true }; /**< Enable/disable -cl-fast-relaxed-math flag */
- bool _is_fma_available{ false }; /**< Is fma instruction available */
- unsigned int _n0{ 0 }; /**< Number of columns processed by each thread */
- unsigned int _m0{ 0 }; /**< Number of rows processed by each thread */
+ bool _export_input_to_cl_image{false}; /**< Export input to cl_image */
+ bool _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
+ bool _fast_relaxed_math{true}; /**< Enable/disable -cl-fast-relaxed-math flag */
+ bool _is_fma_available{false}; /**< Is fma instruction available */
+ unsigned int _n0{0}; /**< Number of columns processed by each thread */
+ unsigned int _m0{0}; /**< Number of rows processed by each thread */
};
/** Forward declaration */
@@ -127,22 +129,20 @@ public:
* |F16 |F16 |F16 |F16 |
* |F32 |F32 |F32 |F32 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Constructor
*
* Similar to @ref ClComponentDepthwiseConv2d::validate()
*/
- ClComponentDepthwiseConv2d(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ ClComponentDepthwiseConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Destructor */
~ClComponentDepthwiseConv2d() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index a713c82003..98f3d6a882 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -23,8 +23,8 @@
*/
#include "ClComponentDirectConv2d.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
#include "src/core/CL/CLValidate.h"
@@ -57,7 +57,8 @@ bool ClComponentDirectConv2dSettings::fast_relaxed_math() const
return _fast_relaxed_math;
}
-ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
+ClComponentDirectConv2dSettings &
+ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
{
_desc = desc;
return *this;
@@ -68,11 +69,10 @@ DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descrip
return _desc;
}
-Status ClComponentDirectConv2d::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
+Status ClComponentDirectConv2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
{
ARM_COMPUTE_UNUSED(properties);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -86,7 +86,7 @@ Status ClComponentDirectConv2d::validate(
// Matching data type
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
}
@@ -94,7 +94,7 @@ Status ClComponentDirectConv2d::validate(
// Matching data layout
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
}
@@ -103,7 +103,7 @@ Status ClComponentDirectConv2d::validate(
ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
}
@@ -112,22 +112,23 @@ Status ClComponentDirectConv2d::validate(
// wei shape is correct
const DataLayout data_layout = src->data_layout();
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
// dst shape is correct
- PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top,
- attributes.pad().bottom, DimensionRoundingType{});
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
// bia shape is correct
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
"Biases size and number of dst feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1,
- "Biases should be one dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional");
}
// 2. Check support level
@@ -137,24 +138,25 @@ Status ClComponentDirectConv2d::validate(
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
const auto desc = settings.direct_conv_descriptor();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
"N0 can only be: 1, 2, 3, 4, 8, and 16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
"K0 can only be: 1, 2, 3, 4, 8, and 16");
return Status{};
}
-ClComponentDirectConv2d::ClComponentDirectConv2d(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
- : IGpuKernelComponent{ id, properties, tensors },
+ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{ std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings) }
+ _component_writer{std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings)}
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+ _component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)}
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
}
@@ -165,7 +167,7 @@ ClComponentDirectConv2d::~ClComponentDirectConv2d()
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index 24acb1b2c1..d6d9705d3c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -26,7 +26,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
#include <memory>
namespace arm_compute
@@ -61,7 +63,7 @@ public:
DirectConvComputeKernelInfo direct_conv_descriptor() const;
private:
- bool _fast_relaxed_math{ true };
+ bool _fast_relaxed_math{true};
DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor
};
@@ -111,22 +113,20 @@ public:
* |F16 |F16 |F16 |F16 |
* |F32 |F32 |F32 |F32 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Constructor
*
* Similar to @ref ClComponentDirectConv2d::validate()
*/
- ClComponentDirectConv2d(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ ClComponentDirectConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Destructor */
~ClComponentDirectConv2d() override;
@@ -142,7 +142,7 @@ public:
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuTemplateComponentWriter *template_writer() const override;
#else // ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Get component type */
GpuComponentType type() const override
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 88d729170c..5b136427e4 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -24,6 +24,7 @@
#include "ClComponentElementwiseBinary.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h"
@@ -39,56 +40,55 @@ namespace dynamic_fusion
{
namespace
{
-std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops
-{
- ElementwiseBinaryCommonAttributes::ElementwiseOp::Add,
- ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
- ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul
-};
+std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops{
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul};
}
-Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors, const ElementwiseBinaryCommonAttributes &attributes)
+Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors,
+ const ElementwiseBinaryCommonAttributes &attributes)
{
const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
// Check operator type
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(),
+ "Provided Elementwise operation not supported.");
// Check validity
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
//Check data type for different elementwise operators
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32,
+ DataType::S16, DataType::U8);
// dst shape is correct
const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst.");
const auto &lhs_shape = lhs->tensor_shape();
const auto &rhs_shape = rhs->tensor_shape();
const auto &dst_shape = dst->tensor_shape();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(lhs_shape, dst_shape, 0) && detail::have_different_dimensions(rhs_shape, dst_shape, 0),
- "Only LHS or RHS can be broadcasting, not both.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) &&
+ detail::have_different_dimensions(rhs_shape, dst_shape, 0),
+ "Only LHS or RHS can be broadcasting, not both.");
// Dimension Y and Z are collapsed together in the current kernel implementation,
// hence they cannot be independently broadcast or non-broadcast.
// See: ClTemplateElementwiseBinary::get_window
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]),
- "Dimension Y and Z must both be either broadcast or non-broadcast.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) !=
+ (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]),
+ "Dimension Y and Z must both be either broadcast or non-broadcast.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(lhs_shape, dst_shape, 3),
- "LHS broadcast in dimension 3 or higher is not supported.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3),
+ "LHS broadcast in dimension 3 or higher is not supported.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(rhs_shape, dst_shape, 3),
- "RHS broadcast in dimension 3 or higher is not supported.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3),
+ "RHS broadcast in dimension 3 or higher is not supported.");
// Matching data type
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
@@ -112,22 +112,15 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &t
ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
{
}
-ClComponentElementwiseBinary::ClComponentElementwiseBinary(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuKernelComponent{ id, properties, tensors },
+ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)
-}
+ _component_writer{std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)}
#else //ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)
-}
+ _component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)}
#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
index f7175903d0..7589b9732c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -82,17 +82,17 @@ public:
* |S16 |S16 |S16 |
* |U8 |U8 |U8 |
*/
- static Status validate(const ArgumentPack<ITensorInfo> &tensors, const ElementwiseBinaryCommonAttributes &attributes);
+ static Status validate(const ArgumentPack<ITensorInfo> &tensors,
+ const ElementwiseBinaryCommonAttributes &attributes);
/** Constructor
*
* Similar to @ref ClComponentElementwiseBinary::validate()
*/
- ClComponentElementwiseBinary(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ ClComponentElementwiseBinary(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
/** Destructor */
~ClComponentElementwiseBinary() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
index 279c77e227..27c13bd654 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
@@ -25,9 +25,10 @@
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
@@ -37,10 +38,9 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status ClComponentLogits1DMaxShiftExpSum::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
+Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
{
ARM_COMPUTE_UNUSED(properties, attributes);
@@ -75,8 +75,8 @@ ClComponentLogits1DMaxShiftExpSum::ClComponentLogits1DMaxShiftExpSum(ComponentId
const Properties &properties,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuKernelComponent{ id, properties, tensors },
- _component_writer{ std::make_unique<ClTemplateLogits1DMaxShiftExpSum>(id, tensors, attributes) }
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<ClTemplateLogits1DMaxShiftExpSum>(id, tensors, attributes)}
{
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
index b5db458248..91ab5de3b5 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM
#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
@@ -89,10 +90,8 @@ public:
* |F16 | F16 | F16 |
* |F32 | F32 | F32 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
/** Constructor
*
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
index 7864d56d29..fb2544385c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
@@ -25,9 +25,10 @@
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
@@ -37,10 +38,9 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status ClComponentLogits1DNorm::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
+Status ClComponentLogits1DNorm::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
{
ARM_COMPUTE_UNUSED(properties, attributes);
@@ -77,8 +77,8 @@ ClComponentLogits1DNorm::ClComponentLogits1DNorm(ComponentId
const Properties &properties,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuKernelComponent{ id, properties, tensors },
- _component_writer{ std::make_unique<ClTemplateLogits1DNorm>(id, tensors, attributes) }
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<ClTemplateLogits1DNorm>(id, tensors, attributes)}
{
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
index 5bd350b9bd..74c0273604 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM
#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
@@ -86,10 +87,8 @@ public:
* |F16 | F16 | F16 |
* |F32 | F32 | F32 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
/** Constructor
*
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
index d415769094..409b191df5 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -24,13 +24,15 @@
#include "ClComponentPool2d.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h"
#include "src/dynamic_fusion/utils/Utils.h"
+
#include <memory>
namespace arm_compute
@@ -39,23 +41,24 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status ClComponentPool2d::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
+Status ClComponentPool2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
{
ARM_COMPUTE_UNUSED(properties);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), "Unsupported Pooling type");
+ ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX),
+ "Unsupported Pooling type");
// 1. Check validity
// Check if pooling is valid
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
- "Pooling region that is entirely outside input tensor is unsupported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
+ "Pooling region that is entirely outside input tensor is unsupported");
// Matching data type
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -70,8 +73,9 @@ Status ClComponentPool2d::validate(
// Device requirements are met
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_pool_shape(
+ *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
// 2. Check support level
// Data type
@@ -83,23 +87,16 @@ Status ClComponentPool2d::validate(
return Status{};
}
-ClComponentPool2d::ClComponentPool2d(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
- : IGpuKernelComponent{ id, properties, tensors },
+ClComponentPool2d::ClComponentPool2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings)
-}
+ _component_writer{std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings)}
#else //ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)
-}
+ _component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)}
#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
index 6814bf9243..98fed65004 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -25,6 +25,7 @@
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
@@ -82,11 +83,10 @@ public:
* |F16 |F16 |
* |F32 |F32 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Constructor
*
@@ -96,12 +96,11 @@ public:
* @param[in] attributes Component attributes
* @param[in] settings Component settings
*/
- ClComponentPool2d(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
+ ClComponentPool2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
/** Destructor */
~ClComponentPool2d() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
index 66e2ee6956..0ece9de970 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "ClComponentReshape.h"
+
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h"
@@ -49,12 +51,10 @@ Status ClComponentReshape::validate(const ArgumentPack<ITensorInfo> &tensors)
return Status{};
}
-ClComponentReshape::ClComponentReshape(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors)
- : IGpuKernelComponent{ id, properties, tensors },
- _component_writer{ std::make_unique<ClTemplateReshape>(id, tensors) }
+ClComponentReshape::ClComponentReshape(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<ClTemplateReshape>(id, tensors)}
{
}
ClComponentReshape::~ClComponentReshape()
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
index f8d165b4c8..78163d6603 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
@@ -73,10 +73,7 @@ public:
* @param[in] properties Component properties @ref Properties
* @param[in] tensors Tensor arguments to the component
*/
- ClComponentReshape(
- ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors);
+ ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
/** Destructor */
~ClComponentReshape() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
index 6df1d9b3db..b05eb04698 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
@@ -66,7 +66,9 @@ Status ClComponentResize::validate(const IGpuKernelComponent::Properties &proper
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
// Align corners and sampling policy conformance
- ARM_COMPUTE_RETURN_ERROR_ON(attributes.align_corners() && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy()));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ attributes.align_corners() &&
+ !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy()));
// All tensor infos are initialized
ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
@@ -79,11 +81,11 @@ ClComponentResize::ClComponentResize(ComponentId id,
const IGpuKernelComponent::Properties &properties,
const ArgumentPack<ITensorInfo> &tensors,
const ClComponentResize::Attributes &attributes)
- : IGpuKernelComponent{ id, properties, tensors },
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{ std::make_unique<ClTemplateResize>(id, tensors, attributes) }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{ std::make_unique<GpuCkwResize>(id, tensors, attributes) }
+ _component_writer{std::make_unique<ClTemplateResize>(id, tensors, attributes)}
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+ _component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)}
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
}
@@ -94,7 +96,7 @@ ClComponentResize::~ClComponentResize()
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
index 474524f8fc..29276c3257 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
@@ -26,6 +26,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE
#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
namespace arm_compute
@@ -43,7 +44,7 @@ class ArgumentPack;
/** Forward declaration */
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
class ClTemplateResize;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwResize;
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -82,10 +83,8 @@ public:
* |U8 |U8 |
* |S16 |S16 |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
/** Constructor
*
@@ -114,7 +113,7 @@ public:
/** Get writer for the component */
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuTemplateComponentWriter *template_writer() const override;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ckw_component_driver() const override;
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -127,7 +126,7 @@ public:
private:
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<ClTemplateResize> _component_writer;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwResize> _component_writer;
#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
index 12b81c3d56..dcbecaff35 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -38,25 +38,19 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status ClComponentStore::validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors)
+Status ClComponentStore::validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
{
ARM_COMPUTE_UNUSED(properties, tensors);
return Status{};
}
-ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
- : IGpuKernelComponent{ id, properties, tensors },
+ClComponentStore::ClComponentStore(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{id, properties, tensors},
#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<ClTemplateStore>(id, tensors)
-}
+ _component_writer{std::make_unique<ClTemplateStore>(id, tensors)}
#else //ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer
-{
- std::make_unique<GpuCkwStore>(id, tensors)
-}
+ _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
index 853ee39012..948785c480 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
#include <memory>
namespace arm_compute
@@ -70,9 +71,7 @@ public:
* |:--------------|:--------------|
* |All |All |
*/
- static Status validate(
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors);
+ static Status validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
/** Constructor
*
* Similar to @ref ClComponentStore::validate()
diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
index bc7133f4df..4c3e84e59d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
@@ -46,18 +46,16 @@ using namespace experimental::dynamic_fusion;
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op)
{
- const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name =
- {
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff" },
- { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub" }
- };
+ const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name = {
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}};
os << op_name.at(op);
return os;
}
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
index e7ee1c10df..2cec67dc65 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -32,12 +33,11 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs)
+Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+ DataType::S16, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Add then call the elementwise common validate_op
@@ -46,12 +46,11 @@ Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch,
return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
}
-Status GpuAdd::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs)
+Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+ DataType::S16, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Add then call the elementwise common is_supported_op
@@ -60,9 +59,7 @@ Status GpuAdd::is_supported_op(const GpuWorkloadContext &context,
return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
}
-ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *lhs,
- ITensorInfo *rhs)
+ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
{
// No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
// Set the elementwise operation to Add then call the elementwise common create_op
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
index 33c2d43e07..6f35e66ea8 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -23,12 +23,11 @@
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
-
-#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
namespace arm_compute
{
@@ -49,7 +48,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -58,25 +57,22 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
// Check support level
// Data Type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
-
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
+ DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8,
+ DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32,
+ DataType::S32, DataType::F16, DataType::F32);
+
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
// Validate Cast Component
{
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
- auto settings = ClComponentCast::Settings();
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentCast::Settings();
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
@@ -94,16 +90,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
} // namespace
-Status GpuCast::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const CastAttributes &attributes)
+Status
+GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes)
{
return is_supported_op_helper(context, src, nullptr, attributes);
}
-Status GpuCast::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const CastAttributes &attributes)
+Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -127,9 +120,7 @@ Status GpuCast::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
}
-ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const CastAttributes &attributes)
+ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, attributes);
@@ -145,14 +136,15 @@ ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch,
GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
const auto *sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
// Add Depthwise Conv2d Component
{
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
- auto settings = ClComponentCast::Settings();
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentCast::Settings();
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
index 89b533c9b8..697b7d4e1f 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
@@ -25,14 +25,13 @@
#include "arm_compute/core/experimental/Types.h"
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace experimental
@@ -48,12 +47,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), "Maximum clamp value cannot be lower than minimum value");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(),
+ "Maximum clamp value cannot be lower than minimum value");
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -61,16 +61,15 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
auto_init_if_empty(dst_info_to_validate, *src->clone());
// CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped
- const ClComponentActivation::Attributes act_info
- {
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val()
- };
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ attributes.max_val(), attributes.min_val()};
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
// Validate Activation Component
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC, src);
@@ -87,16 +86,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
} // namespace
-Status GpuClamp::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const ClampAttributes &attributes)
+Status
+GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes)
{
return is_supported_op_helper(context, src, nullptr, attributes);
}
-Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const ClampAttributes &attributes)
+Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -121,9 +117,7 @@ Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
}
-ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const ClampAttributes &attributes)
+ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, attributes);
@@ -139,18 +133,16 @@ ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch,
GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
// CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped
- const ClComponentActivation::Attributes act_info
- {
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val()
- };
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ attributes.max_val(), attributes.min_val()};
const auto *const sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
// Add Activation Component
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
index cb270ed4b0..aaeec543f8 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
@@ -45,24 +45,30 @@ namespace dynamic_fusion
{
namespace
{
-DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
{
// Get GPU target
GPUTarget gpu_target = CLScheduler::get().target();
- std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t = arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target);
+ std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t =
+ arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target);
return t->configure(src, weights, conv_info);
}
-void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const Conv2dAttributes &attributes)
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const Conv2dAttributes &attributes)
{
- if(dst->total_size() == 0U)
+ if (dst->total_size() == 0U)
{
- const auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wei->tensor_shape(),
- PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
- attributes.pad().right,
- attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+ const auto shape = misc::shape_calculator::compute_deep_convolution_shape(
+ src->tensor_shape(), src->data_layout(), wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
}
@@ -83,7 +89,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -98,18 +104,20 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
// Check components
const auto gpu_target = context.gpu_target();
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
// Validate Direct Conv2d Component
{
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
- auto settings = ClComponentDirectConv2d::Settings();
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDirectConv2d::Settings();
settings.fast_relaxed_math(
- (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16));
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+ dst_info_to_validate_ptr->data_type() == DataType::F16));
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
@@ -142,14 +150,14 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
const ITensorInfo *src,
const ITensorInfo *wei,
const ITensorInfo *bia,
- const Conv2dAttributes &attributes)
+ const Conv2dAttributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported");
// Check if tensors have valid id. I.e. they are created from a sketch
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
}
@@ -178,16 +186,13 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
}
-ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *wei,
- ITensorInfo *bia,
- const Conv2dAttributes &attributes)
+ITensorInfo *GpuConv2d::create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes)
{
ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
- attributes.pad().right,
- attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType::FLOOR);
// Initialize the direct convolution descriptor
const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info);
@@ -207,7 +212,7 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch,
const auto gpu_target = sketch_ctx->gpu_target();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
@@ -216,17 +221,17 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch,
// Add Direct Conv2d Component
{
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
auto settings = ClComponentDirectConv2d::Settings();
settings.fast_relaxed_math(
- (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
settings.direct_conv_descriptor(desc);
- if(settings.export_to_cl_image())
+ if (settings.export_to_cl_image())
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
}
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
index c72098e943..e2b673bd43 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
@@ -28,8 +28,8 @@
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
@@ -42,20 +42,20 @@ namespace dynamic_fusion
{
namespace
{
-void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes)
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const DepthwiseConv2dAttributes &attributes)
{
- if(dst->total_size() == 0U)
+ if (dst->total_size() == 0U)
{
- const PadStrideInfo pad_stride_info(attributes.stride().x(),
- attributes.stride().y(),
- attributes.pad().left,
- attributes.pad().right,
- attributes.pad().top,
- attributes.pad().bottom,
+ const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
attributes.dimension_rounding_type());
- const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() };
- const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+ const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+ attributes.dilation()};
+ const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
}
@@ -76,7 +76,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -91,40 +91,44 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
const GpuTarget gpu_target = context.gpu_target();
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const CLCompileContext *cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
// Validate Depthwise Conv2d Component
{
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
- auto settings = ClComponentDepthwiseConv2d::Settings();
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDepthwiseConv2d::Settings();
- const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
- attributes.pad().right,
- attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+ attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType::FLOOR);
// Get the depthwise convolution compute parameters
- auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info =
+ t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
settings.fast_relaxed_math(
- (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16));
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+ dst_info_to_validate_ptr->data_type() == DataType::F16));
settings.is_fma_available(get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
- .m0(dwc_info.m0)
- .n0(dwc_info.n0)
- .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
- .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+ .m0(dwc_info.m0)
+ .n0(dwc_info.n0)
+ .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+ .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
arguments.add_const_tensor(ACL_SRC_1, wei);
arguments.add_const_tensor(ACL_SRC_2, bia);
arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
- ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
}
}
else
@@ -158,7 +162,7 @@ Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch,
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
- if(bia != nullptr)
+ if (bia != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
}
@@ -205,35 +209,37 @@ ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sket
const auto *sketch_ctx = sketch.implementation().context();
const GpuTarget gpu_target = sketch_ctx->gpu_target();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
// Add Depthwise Conv2d Component
{
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
- auto settings = ClComponentDepthwiseConv2d::Settings();
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDepthwiseConv2d::Settings();
- const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
- attributes.pad().right,
- attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+ attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType::FLOOR);
// Get the depthwise convolution compute parameters
- auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info =
+ t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
- .m0(dwc_info.m0)
- .n0(dwc_info.n0)
- .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
- .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+ .m0(dwc_info.m0)
+ .n0(dwc_info.n0)
+ .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+ .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
- if(settings.export_input_to_cl_image())
+ if (settings.export_input_to_cl_image())
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src);
}
- if(settings.export_weights_to_cl_image())
+ if (settings.export_weights_to_cl_image())
{
arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
}
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
index 464a32cbad..b871171e8d 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -32,9 +33,7 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status GpuMul::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs)
+Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
@@ -46,9 +45,7 @@ Status GpuMul::validate_op(const GpuWorkloadSketch &sketch,
return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
}
-Status GpuMul::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs)
+Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
@@ -60,9 +57,7 @@ Status GpuMul::is_supported_op(const GpuWorkloadContext &context,
return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
}
-ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *lhs,
- ITensorInfo *rhs)
+ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
{
// Set the elementwise operation to Mul then call the elementwise common create_op
ElementwiseBinaryCommonAttributes common_attributes{};
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
index 107a5e5fa7..f0d368d757 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
@@ -26,10 +26,9 @@
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
-
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/utils/Utils.h"
namespace arm_compute
@@ -43,9 +42,7 @@ namespace
constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
} // namespace
-Status GpuOutput::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const ITensorInfo *dst)
+Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -60,9 +57,7 @@ Status GpuOutput::is_supported_op(const GpuWorkloadContext &context,
return Status{};
}
-Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const ITensorInfo *dst)
+Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -90,9 +85,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch,
return status;
}
-void GpuOutput::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *dst)
+void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst)
{
ARM_COMPUTE_LOG_PARAMS(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst));
@@ -104,14 +97,14 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch,
auto &comp_graph = sketch.implementation().component_graph();
const auto sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
// Add store component
{
IGpuKernelComponent::Properties properties;
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
index 7ecfa0158b..55c604aacc 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -22,20 +22,21 @@
* SOFTWARE.
*/
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
-
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
-#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
#include "src/dynamic_fusion/utils/Utils.h"
namespace arm_compute
@@ -46,11 +47,15 @@ namespace dynamic_fusion
{
namespace
{
-void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const Pool2dAttributes &attributes, const GpuPool2dSettings &settings)
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
{
- if(dst->total_size() == 0U)
+ if (dst->total_size() == 0U)
{
- auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
+ auto shape = misc::shape_calculator::compute_pool_shape(
+ *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
}
}
@@ -82,7 +87,7 @@ bool GpuPool2dSettings::use_inf_as_limit() const
Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch,
const ITensorInfo *src,
- const Pool2dAttributes &attributes,
+ const Pool2dAttributes &attributes,
const GpuPool2dSettings &settings)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -110,7 +115,7 @@ Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch,
Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
const ITensorInfo *src,
const Pool2dAttributes &attributes,
- const GpuPool2dSettings &settings)
+ const GpuPool2dSettings &settings)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
// Data type
@@ -118,7 +123,8 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
// Data layout
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
// Check exclude padding is not false
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), "Exclude padding must be set to true in Attributes!");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(),
+ "Exclude padding must be set to true in Attributes!");
// Auto initialize dst tensor info
TensorInfo dst_info_to_validate;
@@ -126,14 +132,15 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings);
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
// Validate Component
{
- const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
@@ -148,10 +155,10 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
return Status{};
}
-ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Pool2dAttributes &attributes,
- const GpuPool2dSettings &settings)
+ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
{
// Assert validation
ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings));
@@ -168,7 +175,7 @@ ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch,
const auto sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
ARM_COMPUTE_UNUSED(cl_compile_ctx);
@@ -177,7 +184,7 @@ ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch,
// Add Component
{
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
index 0f43a578df..3def7a1a81 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h"
+
#include "arm_compute/core/Error.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
namespace arm_compute
{
@@ -40,14 +42,14 @@ namespace
Status is_supported_op_helper(const GpuWorkloadContext &context,
const ITensorInfo *src,
const ITensorInfo *dst,
- const ReshapeAttributes &attributes)
+ const ReshapeAttributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -55,7 +57,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape()));
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
@@ -78,16 +80,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
GpuOperatorType operator_type = GpuOperatorType::Complex;
} // namespace
-Status GpuReshape::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const Attributes &attributes)
+Status
+GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
{
return is_supported_op_helper(context, src, nullptr, attributes);
}
-Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const Attributes &attributes)
+Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -111,9 +110,7 @@ Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
}
-ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const Attributes &attributes)
+ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, attributes.shape());
@@ -127,7 +124,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch,
// Translate into components and add to component graph
auto &comp_graph = sketch.implementation().component_graph();
const auto sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
ARM_COMPUTE_UNUSED(cl_compile_ctx);
@@ -136,7 +133,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch,
// Add ElementwiseBinary Component
{
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
index 5f52eea7d0..fb09875b33 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
@@ -26,12 +26,12 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
-
-#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
namespace arm_compute
{
@@ -43,7 +43,7 @@ namespace
{
void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes)
{
- if(dst->total_size() == 0U)
+ if (dst->total_size() == 0U)
{
TensorShape out_shape = src->tensor_shape();
@@ -64,7 +64,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -73,22 +73,25 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
// Check support level
// Data type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::U8, DataType::S16, DataType::F16, DataType::F32);
// Data layout
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
// Interpolation policy
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && attributes.interpolation_policy() != InterpolationPolicy::BILINEAR,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ attributes.interpolation_policy() != InterpolationPolicy::BILINEAR,
"Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR");
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
// Validate Activation Component
{
- const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
@@ -107,16 +110,14 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
} // namespace
-Status GpuResize::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const Attributes &attributes)
+Status
+GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
{
return is_supported_op_helper(context, src, nullptr, attributes);
}
-Status GpuResize::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src,
- const GpuResize::Attributes &attributes)
+Status
+GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -141,9 +142,7 @@ Status GpuResize::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
}
-ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- const GpuResize::Attributes &attributes)
+ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, attributes);
@@ -159,13 +158,14 @@ ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch,
GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
const auto *sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
// Add Resize Component
{
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
index 09debad969..a2260c8c36 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
@@ -23,14 +23,15 @@
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
namespace arm_compute
{
@@ -40,9 +41,7 @@ namespace dynamic_fusion
{
namespace
{
-Status is_supported_op_helper(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const ITensorInfo *dst)
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
@@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
auto_init_if_empty(dst_info_to_validate, *src->clone());
- const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC };
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
// Validate Activation Component
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC, src);
@@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
} // namespace
-Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src)
+Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
{
return is_supported_op_helper(context, src, nullptr);
}
-Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src)
+Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -112,8 +110,7 @@ Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
}
-ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src)
+ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src);
@@ -128,15 +125,15 @@ ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch,
// Translate into components and add to component graph
GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
- const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC };
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
const auto *const sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
// Add Activation Component
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
index ffc4553a7d..c87b282aec 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
@@ -22,13 +22,14 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
+
#include "arm_compute/core/Error.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
@@ -52,7 +53,7 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
// Auto initialize dst tensor info
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate = *dst;
}
@@ -61,11 +62,12 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
auto_init_if_empty(dst_info_to_validate, *src->clone());
}
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
- const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
TensorShape logits_sum_shape = src->tensor_shape();
TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape));
@@ -86,7 +88,8 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
arguments_norm.add_const_tensor(ACL_SRC_1, &sum);
arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
- ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes));
ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes));
}
else
@@ -105,14 +108,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch,
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= attributes.axis());
+ ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= attributes.axis());
// Auto initialize dst tensor info
TensorInfo dst_info_to_validate = *dst;
auto_init_if_empty(dst_info_to_validate, *src->clone());
- const size_t actual_axis = static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions())));
- const bool needs_permute = actual_axis != 0;
+ const size_t actual_axis =
+ static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions())));
+ const bool needs_permute = actual_axis != 0;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet.");
// Perform fusion test and check if the operator meets the fusion constraints
@@ -128,17 +133,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
}
-void GpuSoftmax::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src,
- ITensorInfo *dst,
- const Attributes &attributes)
+void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_LOG_PARAMS(src, dst, attributes);
TensorShape logits_sum_shape = src->tensor_shape();
- ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
+ ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(
+ src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
logits_sum_shape.set(0, 1);
- ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
+ ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(
+ src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
// Auto initialize dst tensor info and the auxiliary tensor infos as well
auto_init_if_empty(*dst, *src->clone());
@@ -151,7 +155,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch,
auto &comp_graph = sketch.implementation().component_graph();
const auto sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
ARM_COMPUTE_UNUSED(cl_compile_ctx);
@@ -160,7 +164,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch,
// Add Direct Conv2d Component
{
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments_exp_sum;
ArgumentPack<ITensorInfo> arguments_norm;
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
index 8240008f2a..e5d62c9930 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -32,12 +33,11 @@ namespace experimental
{
namespace dynamic_fusion
{
-Status GpuSub::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs)
+Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+ DataType::S16, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Sub then call the elementwise common validate_op
@@ -46,12 +46,11 @@ Status GpuSub::validate_op(const GpuWorkloadSketch &sketch,
return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
}
-Status GpuSub::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *lhs,
- const ITensorInfo *rhs)
+Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+ DataType::S16, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Sub then call the elementwise common is_supported_op
@@ -60,9 +59,7 @@ Status GpuSub::is_supported_op(const GpuWorkloadContext &context,
return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
}
-ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *lhs,
- ITensorInfo *rhs)
+ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
{
// No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
// Set the elementwise operation to Sub then call the elementwise common create_op
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
index c00716c76e..bf0f274c5c 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -23,14 +23,15 @@
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h"
+
#include "arm_compute/core/experimental/Types.h"
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
namespace arm_compute
{
@@ -40,9 +41,7 @@ namespace dynamic_fusion
{
namespace
{
-Status is_supported_op_helper(const GpuWorkloadContext &context,
- const ITensorInfo *src,
- const ITensorInfo *dst)
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
@@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
auto_init_if_empty(dst_info_to_validate, *src->clone());
- const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH };
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
// Validate Activation Component
- const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC, src);
@@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
} // namespace
-Status GpuTanh::is_supported_op(const GpuWorkloadContext &context,
- const ITensorInfo *src)
+Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
{
return is_supported_op_helper(context, src, nullptr);
}
-Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch,
- const ITensorInfo *src)
+Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -112,8 +110,7 @@ Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch,
return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
}
-ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch,
- ITensorInfo *src)
+ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src);
@@ -128,15 +125,15 @@ ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch,
// Translate into components and add to component graph
GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
- const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH };
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
const auto *const sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
// Add Activation Component
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
index 7c087c9a7b..d79a4c42c9 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
namespace arm_compute
{
@@ -38,9 +39,10 @@ namespace
{
void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
- if(dst->total_size() == 0U)
+ if (dst->total_size() == 0U)
{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs);
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs);
auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first));
}
}
@@ -56,7 +58,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
- if(dst != nullptr)
+ if (dst != nullptr)
{
dst_info_to_validate_ptr = dst;
}
@@ -64,7 +66,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs);
// Check components
- if(context.gpu_language() == GpuLanguage::OpenCL)
+ if (context.gpu_language() == GpuLanguage::OpenCL)
{
const auto cl_compile_ctx = context.cl_compile_context();
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
@@ -90,7 +92,8 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
GpuOperatorType operator_type = GpuOperatorType::Simple;
} // namespace
-ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation)
+ElementwiseBinaryCommonAttributes &
+ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation)
{
_operation = operation;
return *this;
@@ -157,14 +160,14 @@ ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch
const auto sketch_ctx = sketch.implementation().context();
- if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
// Add ElementwiseBinary Component
{
auto properties = IGpuKernelComponent::Properties();
- properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
ArgumentPack<ITensorInfo> arguments;
arguments.add_const_tensor(ACL_SRC_0, lhs);
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
index 0972b4e8e2..775b0a0c8c 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "GpuKernelVariableTable.h"
+
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/ITensorInfo.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
namespace arm_compute
@@ -32,14 +34,17 @@ namespace experimental
{
namespace dynamic_fusion
{
-void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias)
+void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+ const ITensorInfo *tensor,
+ GpuKernelArgumentInfo argument_info,
+ const std::string &alias)
{
ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
// Do not re-declare if the variable associated with the tensor has already been declared
auto it = _vars.find(tensor->id());
- if(it != _vars.end())
+ if (it != _vars.end())
{
ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info));
return;
@@ -47,14 +52,12 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com
const auto target = comp_group.get_tile_for_tensor(tensor);
- if(target != tensor)
+ if (target != tensor)
{
// If the tensor uses a shared tile, don't declare another variable.
it = _vars.find(target->id());
- ARM_COMPUTE_ERROR_ON_MSG(
- it == _vars.end(),
- "The variable used for this tensor must have been declared.");
+ ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared.");
_vars[tensor->id()] = it->second;
}
@@ -64,7 +67,7 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com
std::stringstream ss;
ss << alias << "_t" << abs(tensor->id());
const auto uniq_name = ss.str();
- TensorVariable var{ tensor->id(), uniq_name, argument_info };
+ TensorVariable var{tensor->id(), uniq_name, argument_info};
_vars.emplace(tensor->id(), var);
}
@@ -76,12 +79,13 @@ GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(cons
return var;
}
-GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+GpuKernelVariableTable::VariableList
+GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
{
VariableList vars{};
- for(const auto &tensor : tensors)
+ for (const auto &tensor : tensors)
{
- if(!tensor->has_valid_id())
+ if (!tensor->has_valid_id())
{
continue;
}
@@ -90,23 +94,19 @@ GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(c
return vars;
}
-TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
- : value{ var.uniq_name }
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name}
{
}
-TagVal::TagVal(const std::string &val)
- : value{ val }
+TagVal::TagVal(const std::string &val) : value{val}
{
}
-TagVal::TagVal(const char *val)
- : value{ std::string(val) }
+TagVal::TagVal(const char *val) : value{std::string(val)}
{
}
-TagVal::TagVal(const DataType &data_type)
- : value{ get_cl_type_from_data_type(data_type) }
+TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)}
{
}
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
index a49d38e10c..c17f131ada 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
#include "arm_compute/core/ITensorInfo.h"
+
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "support/AclRequires.h"
#include "support/StringSupport.h"
@@ -55,11 +56,11 @@ public:
struct TensorVariable
{
public:
- TensorVariable() = default;
- TensorVariable(const TensorVariable &) = default;
+ TensorVariable() = default;
+ TensorVariable(const TensorVariable &) = default;
TensorVariable &operator=(const TensorVariable &) = default;
- ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id };
- std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+ ITensorInfo::Id id{ITensorInfo::invalid_tensor_id};
+ std::string uniq_name{"empty"}; // Unique name, also the final variable name used in the built code
GpuKernelArgumentInfo kernel_argument_info{};
bool has_valid_id() const
{
@@ -76,7 +77,10 @@ public:
* @param[in] argument_info Kernel argument information
* @param[in] alias Alias for the variable. Will be used as part of the variable name
*/
- void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias = "unnamed");
+ void declare_variable(const GpuKernelComponentGroup &comp_group,
+ const ITensorInfo *tensor,
+ GpuKernelArgumentInfo argument_info,
+ const std::string &alias = "unnamed");
/** Get the @ref TensorVariable associated with @p tensor
*
* @param[in] tensor Tensor info to be queried
@@ -106,8 +110,7 @@ struct TagVal
TagVal(const GpuKernelVariableTable::TensorVariable &var);
/** Construct a @ref TagVal from an integral type */
template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
- TagVal(T val)
- : value{ support::cpp11::to_string(val) }
+ TagVal(T val) : value{support::cpp11::to_string(val)}
{
}
/** Construct a @ref TagVal from a string */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
index 4a1fb142d6..9d0b4f592a 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Window.h"
+
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
@@ -57,8 +58,7 @@ public:
* @param[in] id Component id
* @param[in] tensors Tensor arguments to the components
*/
- IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
- : _id{ id }, _tensors{ tensors }
+ IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
{
}
/** Destructor */
@@ -112,7 +112,7 @@ public:
/** Generate the header list used in the component */
virtual std::set<std::string> get_headers_list() const
{
- return std::set<std::string> {};
+ return std::set<std::string>{};
}
/** Generate the execution window for the component */
virtual Window get_window() const
@@ -131,7 +131,7 @@ public:
}
private:
- ComponentId _id{ -1 };
+ ComponentId _id{-1};
ArgumentPack<ITensorInfo> _tensors{};
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index 3c7c843dd8..c165fb5f33 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
@@ -39,10 +40,7 @@ namespace dynamic_fusion
ClTemplateActivation::ClTemplateActivation(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuTemplateComponentWriter{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes }
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
@@ -62,7 +60,7 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_
code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
)_";
- if(is_root)
+ if (is_root)
{
code += R"_(
// IN(src) {{src}}
@@ -104,17 +102,11 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -173,7 +165,7 @@ std::string ClTemplateActivation::get_config_id() const
std::set<std::string> ClTemplateActivation::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h", "activation_float_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"};
}
Window ClTemplateActivation::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
index ec78cf6ce5..88ee370342 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 4956879ad3..0da3a73801 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
@@ -35,7 +36,7 @@ namespace experimental
namespace dynamic_fusion
{
ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
- : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes }
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -62,7 +63,7 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group)
//------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
)_";
- if(is_root)
+ if (is_root)
{
code += R"_(
// IN_0(src) {{src}}
@@ -82,14 +83,15 @@ TILE(uint, M0, 1, g_dst_indirect_y);
{
)_";
- if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
+ if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
{
code += R"_(
{{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
)_";
}
- if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
+ if (kernel_name == "cast_down" &&
+ (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
{
code += R"_(
{{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
@@ -106,7 +108,7 @@ TILE(uint, M0, 1, g_dst_indirect_y);
})
)_";
- if(is_root)
+ if (is_root)
{
code += R"_(
LOOP_UNROLLING(int, i, 0, 1, M0,
@@ -128,17 +130,11 @@ TILE(uint, M0, 1, g_dst_indirect_y);
void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -199,7 +195,7 @@ std::string ClTemplateCast::get_config_id() const
std::set<std::string> ClTemplateCast::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateCast::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index ab7cc9f05a..8380620ab2 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -36,17 +36,17 @@ ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuTemplateComponentWriter{ id, tensors },
+ : IGpuTemplateComponentWriter{id, tensors},
_src{},
_weight{},
_bias{},
_dst{},
- _attributes{ attributes },
- _settings{ settings }
+ _attributes{attributes},
+ _settings{settings}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
- if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
{
_bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
}
@@ -71,7 +71,7 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &
// IN_1(wei) {{weight}}
)_";
- if(_bias != nullptr && _bias->has_valid_id())
+ if (_bias != nullptr && _bias->has_valid_id())
{
code += R"_(
// IN_1(bia) {{bias}}
@@ -113,7 +113,7 @@ TILE(uint, M0, 1, g_dst_indirect_y);
})
)_";
- if(_weight->dimension(height_idx) < 5)
+ if (_weight->dimension(height_idx) < 5)
{
code += R"_(
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
@@ -147,7 +147,7 @@ TILE(uint, M0, 1, g_dst_indirect_y);
{
)_";
- if(!_settings.is_fma_available())
+ if (!_settings.is_fma_available())
{
code += R"_(
{{dst}}[m0].v += a[xk + m0].v * b[xk].v;
@@ -166,14 +166,14 @@ TILE(uint, M0, 1, g_dst_indirect_y);
}
)_";
- if(_weight->dimension(height_idx) < 5)
+ if (_weight->dimension(height_idx) < 5)
{
code += R"_(
)
)_";
}
- if(_bias && _bias->has_valid_id())
+ if (_bias && _bias->has_valid_id())
{
code += R"_(
TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});
@@ -198,44 +198,31 @@ TILE(uint, M0, 1, g_dst_indirect_y);
return code;
}
-void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable,
+ const ComponentGroup &comp_group) const
{
- const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ?
- GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
- GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(input_type),
- "src");
-
- const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
- GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
- GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
- vtable.declare_variable(
- comp_group,
- _weight,
- GpuKernelArgumentInfo(weight_type),
- "weight");
-
- if(_bias != nullptr && _bias->has_valid_id()) // optional bias
+ const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image()
+ ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
+ : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src");
+
+ const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image()
+ ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
+ : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+
+ vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
+
+ if (_bias != nullptr && _bias->has_valid_id()) // optional bias
{
- vtable.declare_variable(
- comp_group,
- _bias,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- "bias");
+ vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
}
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
-TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
+ const ComponentGroup &comp_group) const
{
TagLUT lut{};
@@ -243,7 +230,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab
lut["src"] = vtable.get_variable(_src);
lut["weight"] = vtable.get_variable(_weight);
- if(_bias != nullptr && _bias->has_valid_id()) // optional bias
+ if (_bias != nullptr && _bias->has_valid_id()) // optional bias
{
lut["bias"] = vtable.get_variable(_bias);
lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
@@ -259,7 +246,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab
lut["SRC_DATA_TYPE"] = _src->data_type();
lut["WEI_DATA_TYPE"] = _weight->data_type();
- switch(vtable.get_variable(_src).kernel_argument_info.type)
+ switch (vtable.get_variable(_src).kernel_argument_info.type)
{
case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
@@ -271,7 +258,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab
break;
}
- switch(vtable.get_variable(_weight).kernel_argument_info.type)
+ switch (vtable.get_variable(_weight).kernel_argument_info.type)
{
case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
@@ -318,7 +305,7 @@ CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup
CLBuildOptions build_opts{};
- if(_settings.fast_relaxed_math())
+ if (_settings.fast_relaxed_math())
{
build_opts.add_option("-cl-fast-relaxed-math");
}
@@ -361,7 +348,7 @@ std::string ClTemplateDepthwiseConv2d::get_config_id() const
std::set<std::string> ClTemplateDepthwiseConv2d::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateDepthwiseConv2d::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
index 84b689ef64..5d04c687c3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D
#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index 3322487910..f6a7a58d1d 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -23,14 +23,13 @@
*/
#include "ClTemplateDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -43,17 +42,17 @@ ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuTemplateComponentWriter{ id, tensors },
+ : IGpuTemplateComponentWriter{id, tensors},
_src{},
_weight{},
_bias{},
_dst{},
- _attributes{ attributes },
- _settings{ settings }
+ _attributes{attributes},
+ _settings{settings}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
- if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
{
_bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
}
@@ -79,7 +78,7 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com
// IN_0(src) {{src}}
// IN_1(wei) {{weight}}
)_";
- if(_bias && _bias->has_valid_id())
+ if (_bias && _bias->has_valid_id())
{
code += R"_(
// IN_1(bia) {{bias}}
@@ -161,7 +160,7 @@ TILE(uint, M0, 1, g_dst_indirect_y);
}
)_";
- if(leftover_loop)
+ if (leftover_loop)
{
code += R"_(
for(; ck < _ISRC_CHANNELS; ++ck)
@@ -186,9 +185,9 @@ TILE(uint, M0, 1, g_dst_indirect_y);
T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
}
)_";
-}
+ }
-code += R"_(
+ code += R"_(
#undef _I_WEI_WIDTH
#undef _I_WEI_HEIGHT
#undef _ISRC_WIDTH
@@ -202,7 +201,7 @@ code += R"_(
}
)_";
- if(_bias && _bias->has_valid_id())
+ if (_bias && _bias->has_valid_id())
{
code += R"_(
TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
@@ -211,9 +210,9 @@ code += R"_(
T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
)_";
-}
+ }
-code += R"_(
+ code += R"_(
LOOP_UNROLLING(int, i, 0, 1, M0,
{
g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
@@ -227,32 +226,19 @@ code += R"_(
void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
- vtable.declare_variable(
- comp_group,
- _weight,
- GpuKernelArgumentInfo(weight_type),
- "weight");
-
- if(_bias && _bias->has_valid_id()) // optional bias
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+
+ const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image()
+ ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
+ : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+ vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
+
+ if (_bias && _bias->has_valid_id()) // optional bias
{
- vtable.declare_variable(
- comp_group,
- _bias,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- "bias");
+ vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
}
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(common_tensor_type),
- "dst");
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
}
TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -262,7 +248,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
lut["src"] = vtable.get_variable(_src);
lut["weight"] = vtable.get_variable(_weight);
- if(_bias && _bias->has_valid_id()) // optional bias
+ if (_bias && _bias->has_valid_id()) // optional bias
{
lut["bias"] = vtable.get_variable(_bias);
lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
@@ -279,34 +265,34 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
lut["WEI_DATA_TYPE"] = _weight->data_type();
lut["SRC_TENSOR_TYPE"] = "BUFFER";
- switch(vtable.get_variable(_weight).kernel_argument_info.type)
+ switch (vtable.get_variable(_weight).kernel_argument_info.type)
{
case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
- {
- lut["WEI_TENSOR_TYPE"] = "IMAGE";
- break;
- }
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
default:
- {
- lut["WEI_TENSOR_TYPE"] = "BUFFER";
- break;
- }
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
}
- const auto width_idx = 1;
- const auto height_idx = 2;
+ const auto width_idx = 1;
+ const auto height_idx = 2;
const auto channel_idx = 0;
- lut["SRC_WIDTH"] = _src->dimension(width_idx);
- lut["SRC_HEIGHT"] = _src->dimension(height_idx);
+ lut["SRC_WIDTH"] = _src->dimension(width_idx);
+ lut["SRC_HEIGHT"] = _src->dimension(height_idx);
lut["SRC_CHANNELS"] = _src->dimension(channel_idx);
- lut["WEI_WIDTH"] = _weight->dimension(width_idx);
- lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
+ lut["WEI_WIDTH"] = _weight->dimension(width_idx);
+ lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
- lut["DST_WIDTH"] = _dst->dimension(width_idx);
- lut["DST_HEIGHT"] = _dst->dimension(height_idx);
+ lut["DST_WIDTH"] = _dst->dimension(width_idx);
+ lut["DST_HEIGHT"] = _dst->dimension(height_idx);
lut["DST_CHANNELS"] = _dst->dimension(channel_idx);
lut["STRIDE_X"] = _attributes.stride().x();
@@ -324,14 +310,14 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c
{
const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
- const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
+ const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
+ const unsigned int n0 = root_window.x().step();
+ const unsigned int m0 = root_window.y().step();
+ const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
CLBuildOptions build_opts{};
- if(_settings.fast_relaxed_math())
+ if (_settings.fast_relaxed_math())
{
build_opts.add_option("-cl-fast-relaxed-math");
}
@@ -379,7 +365,7 @@ std::string ClTemplateDirectConv2d::get_config_id() const
std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateDirectConv2d::get_window() const
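The get_build_options() hunk above derives N0/M0 from the root window steps and a PARTIAL_N0 leftover from the destination width. A standalone sketch of that leftover arithmetic; the clamp helper only approximates adjust_vec_size() and is not the library's function:

#include <algorithm>
#include <iostream>

unsigned int clamp_vec_size(unsigned int requested, unsigned int dim)
{
    // Never use a vector wider than the dimension being processed (assumption).
    return std::min(requested, dim);
}

int main()
{
    const unsigned int dst_w = 17; // e.g. output width in elements
    const unsigned int n0    = clamp_vec_size(4U, dst_w);
    const unsigned int partial_store_n0 = dst_w % n0; // tail handled by PARTIAL_N0 stores

    std::cout << "N0=" << n0 << " PARTIAL_N0=" << partial_store_n0 << "\n";
    return 0;
}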
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
index 8988d3ca1c..03c8cd2f15 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index c0481ae190..78bff3c3f3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -23,14 +23,13 @@
*/
#include "ClTemplateElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
-
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -44,11 +43,7 @@ constexpr unsigned int vector_size_byte_opencl = 16;
ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuTemplateComponentWriter{ id, tensors },
- _lhs{},
- _rhs{},
- _dst{},
- _attributes{ attributes }
+ : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
{
_lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -69,67 +64,67 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup
const bool is_rhs_input = comp_group.is_input_tensor(_rhs);
code =
-R"_(
+ R"_(
//------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
)_";
- if(is_root)
+ if (is_root)
{
code +=
-R"_(
+ R"_(
TILE(uint, M0, 1, g_dst_indirect_y);
)_";
}
- if(is_lhs_input)
+ if (is_lhs_input)
{
code +=
-R"_(
+ R"_(
TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}});
)_";
}
- if(is_rhs_input)
+ if (is_rhs_input)
{
code +=
-R"_(
+ R"_(
TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}});
)_";
}
code +=
-R"_(
+ R"_(
{
)_";
- if(is_lhs_input)
+ if (is_lhs_input)
{
code +=
-R"_(
+ R"_(
{{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w;
T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
)_";
}
- if(is_rhs_input)
+ if (is_rhs_input)
{
code +=
-R"_(
+ R"_(
{{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w;
T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
)_";
}
code +=
-R"_(
+ R"_(
T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
)_";
- if(is_root)
+ if (is_root)
{
// Calculate the destination indirect Y
code +=
-R"_(
+ R"_(
LOOP_UNROLLING(int, i, 0, 1, M0,
{
g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
@@ -139,7 +134,7 @@ R"_(
}
code +=
-R"_(
+ R"_(
}
//------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
)_";
@@ -147,28 +142,18 @@ R"_(
return code;
}
-void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable,
+ const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _lhs,
- GpuKernelArgumentInfo(common_tensor_type),
- "lhs");
-
- vtable.declare_variable(
- comp_group,
- _rhs,
- GpuKernelArgumentInfo(common_tensor_type),
- "rhs");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(common_tensor_type),
- "dst");
+ vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs");
+
+ vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
}
-TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable,
+ const ComponentGroup &comp_group) const
{
TagLUT lut{};
@@ -182,7 +167,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["dst"] = vtable.get_variable(_dst);
lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
- switch(_attributes.operation())
+ switch (_attributes.operation())
{
case Attributes::ElementwiseOp::Add:
lut["ELTWISE_OP"] = "ADD";
@@ -197,10 +182,10 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
}
- ARM_COMPUTE_ERROR_ON(
- comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
- ARM_COMPUTE_ERROR_ON(
- comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
+ ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) &&
+ detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
+ ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) &&
+ detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
// Set broadcast parameters
// PRE: All tensors are broadcast-compatible
@@ -228,9 +213,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0";
lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1";
- lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" :
- (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" :
- "";
+ lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : "";
return lut;
}
@@ -268,7 +251,7 @@ std::string ClTemplateElementwiseBinary::get_config_id() const
std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateElementwiseBinary::get_window() const
@@ -279,8 +262,9 @@ Window ClTemplateElementwiseBinary::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
return win;
}
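The get_window() hunk above sizes each iteration from a fixed 16-byte OpenCL vector budget divided by the element size, clamped to the innermost dimension. A standalone approximation of that calculation (not ACL code):

#include <algorithm>
#include <cstddef>
#include <iostream>

unsigned int elems_per_iteration(std::size_t element_size, unsigned int dim0)
{
    constexpr unsigned int vector_size_byte_opencl = 16;
    const unsigned int requested = vector_size_byte_opencl / static_cast<unsigned int>(element_size);
    // Clamp to the innermost dimension so small tensors do not over-read.
    return std::min(std::max(requested, 1U), dim0);
}

int main()
{
    std::cout << elems_per_iteration(sizeof(float), 64) << "\n"; // 4 elements for F32
    std::cout << elems_per_iteration(2, 64) << "\n";             // 8 elements for F16
    return 0;
}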
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
index 8cca954efe..991c0eca44 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY
#include "arm_compute/core/experimental/Types.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
@@ -48,9 +49,7 @@ public:
* @param[in] tensors Tensor arguments to the components
* @param[in] attributes Component attributes
*/
- ClTemplateElementwiseBinary(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
+ ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
/** Prevent instances of this class from being copy constructed */
ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete;
/** Prevent instances of this class from being copied */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
index a8d8d32b12..522c33a022 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
@@ -38,16 +39,12 @@ namespace dynamic_fusion
{
namespace
{
- constexpr unsigned int serial_vector_size = 8;
+constexpr unsigned int serial_vector_size = 8;
} // namespace
ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuTemplateComponentWriter{ id, tensors },
- _src{},
- _sum{},
- _dst{},
- _attributes{ attributes }
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -79,7 +76,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
const bool beta_defined = (_attributes.beta() != 1.f);
- if(beta_defined)
+ if (beta_defined)
{
code += R"_(
VEC_TYPE beta = (VEC_TYPE){{BETA}};
@@ -91,7 +88,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size);
const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0);
- if(non_multiple_of_n0)
+ if (non_multiple_of_n0)
{
code += R"_(
VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
@@ -111,19 +108,19 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
VEC_TYPE sum1D = 0;
)_";
- if(non_multiple_of_n0)
+ if (non_multiple_of_n0)
{
code += R"_(
data -= max_val;
)_";
- if(beta_defined)
+ if (beta_defined)
{
code += R"_(
data *= beta;
)_";
}
- if(_attributes.is_log_softmax())
+ if (_attributes.is_log_softmax())
{
code += R"_(
VSTORE_PARTIAL(N0, PARTIAL_N0)
@@ -153,14 +150,14 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
data -= max_val;
)_";
- if(beta_defined)
+ if (beta_defined)
{
code += R"_(
data *= beta;
)_";
}
- if(_attributes.is_log_softmax())
+ if (_attributes.is_log_softmax())
{
code += R"_(
VSTORE(N0)
@@ -191,28 +188,18 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
return code;
}
-void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable,
+ const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _sum,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
- "sum");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
+
+ vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
}
-TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable,
+ const ComponentGroup &comp_group) const
{
ARM_COMPUTE_UNUSED(comp_group);
@@ -241,8 +228,8 @@ CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const Compone
ARM_COMPUTE_UNUSED(comp_group);
CLBuildOptions build_opts{};
- const unsigned int reduction_dim_size = _src->dimension(0);
- const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size);
+ const unsigned int reduction_dim_size = _src->dimension(0);
+ const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size);
build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size)));
@@ -264,7 +251,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const
std::set<std::string> ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
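The generated kernel above performs a max-shifted exponential sum: subtract the row maximum, optionally scale by beta, exponentiate and accumulate, with log-softmax keeping the shifted logits instead of their exponentials. A scalar C++ sketch of those stages (illustrative only, not the OpenCL template):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

float max_shift_exp_sum(std::vector<float> &data, float beta, bool is_log_softmax)
{
    const float max_val = *std::max_element(data.begin(), data.end());
    float sum = 0.f;
    for (float &x : data)
    {
        x = (x - max_val) * beta; // shift for numerical stability, then scale
        const float e = std::exp(x);
        sum += e;
        if (!is_log_softmax)
        {
            x = e; // log-softmax keeps the shifted logits for the norm stage
        }
    }
    return sum;
}

int main()
{
    std::vector<float> row{1.f, 2.f, 3.f};
    std::cout << "sum=" << max_shift_exp_sum(row, 1.f, false) << "\n"; // ~1.503
    return 0;
}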
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
index 5d232c0cf2..ac9ddaa9d4 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
@@ -46,7 +46,9 @@ public:
* @param[in] tensors Tensor arguments to the components
* @param[in] attributes Component attributes
*/
- ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ClTemplateLogits1DMaxShiftExpSum(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
/** Prevent instances of this class from being copy constructed */
ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete;
/** Prevent instances of this class from being copied */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
index 056e570a25..7d7c3e6673 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
@@ -25,6 +25,7 @@
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
@@ -38,11 +39,7 @@ namespace dynamic_fusion
ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
- : IGpuTemplateComponentWriter{ id, tensors },
- _src{},
- _sum{},
- _dst{},
- _attributes{ attributes }
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -76,7 +73,7 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com
data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
)_";
- if(_attributes.is_log_softmax())
+ if (_attributes.is_log_softmax())
{
code += R"_(
sum_val = log(sum_val);
@@ -101,23 +98,11 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com
void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _sum,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
- "sum");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
+
+ vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
}
TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -168,14 +153,14 @@ std::string ClTemplateLogits1DNorm::get_config_id() const
std::set<std::string> ClTemplateLogits1DNorm::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateLogits1DNorm::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
constexpr unsigned int serial_vector_size = 16;
- const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0));
+ const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0));
Window win = calculate_max_window(*_src, Steps(vector_size));
return win.collapse(win, Window::DimZ);
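The norm stage above divides each exponentiated value by the row sum, or, for log-softmax, takes log(sum) and subtracts it from the shifted logits. A scalar sketch of both paths (illustrative only):

#include <cmath>
#include <iostream>
#include <vector>

void logits_norm(std::vector<float> &data, float sum_val, bool is_log_softmax)
{
    if (is_log_softmax)
    {
        const float log_sum = std::log(sum_val);
        for (float &x : data)
        {
            x -= log_sum; // x holds the max-shifted (and beta-scaled) logit
        }
    }
    else
    {
        for (float &x : data)
        {
            x /= sum_val; // x holds exp(logit - max)
        }
    }
}

int main()
{
    std::vector<float> exps{0.09f, 0.24f, 0.67f};
    logits_norm(exps, 1.0f, false);
    std::cout << exps[0] + exps[1] + exps[2] << "\n"; // ~1.0
    return 0;
}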
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
index 34840c2100..ebb0374501 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
@@ -23,14 +23,13 @@
*/
#include "ClTemplatePool2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -50,11 +49,7 @@ ClTemplatePool2d::ClTemplatePool2d(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuTemplateComponentWriter{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes },
- _settings{ settings }
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -71,7 +66,7 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou
ARM_COMPUTE_UNUSED(comp_group);
// Condition to use 2x2 optimized kernel
- if(_attributes.pool_size() == Size2D(2, 2))
+ if (_attributes.pool_size() == Size2D(2, 2))
{
return get_2x2_kernel_code();
}
@@ -83,11 +78,13 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou
std::string ClTemplatePool2d::get_MxN_kernel_code() const
{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+ const auto pool_type = _attributes.pool_type();
+ const bool fp_mixed_precision =
+ (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
// Define pool op macro.
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+ std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+ : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
// Kernel start
// Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
@@ -129,7 +126,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
)_";
// Determine filter size depending on if padding is excluded or not
- if(_attributes.exclude_padding())
+ if (_attributes.exclude_padding())
{
code += R"_(
const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
@@ -144,7 +141,8 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
// Loop through pool size
// if global pooling
- if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
+ if (_attributes.pool_size().x() == _src->dimension(width_idx) &&
+ _attributes.pool_size().y() == _src->dimension(height_idx))
{
// Begin loop
code += R"_(
@@ -173,7 +171,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
// if condition inside loop - use 32bit acc if mixed_precision.
// End loop through pooling section.
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
code += R"_(
@@ -194,7 +192,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
}
// For Pool AVG ONLY, divide pool output by filter size
- if(pool_type == PoolingType::AVG)
+ if (pool_type == PoolingType::AVG)
{
code += R"_(
res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
@@ -202,7 +200,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
}
// If mixed precision convert datatype before storing. Then end kernel.
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
code += R"_(
VEC_DATA_TYPE({{DATA_TYPE}}, N0)
@@ -228,9 +226,11 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
std::string ClTemplatePool2d::get_2x2_kernel_code() const
{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+ const auto pool_type = _attributes.pool_type();
+ const bool fp_mixed_precision =
+ (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+ std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+ : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
@@ -274,7 +274,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
)_";
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
code += R"_(
@@ -294,7 +294,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
)_";
}
- if(pool_type != PoolingType::MAX)
+ if (pool_type != PoolingType::MAX)
{
// Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
code += R"_(
@@ -321,10 +321,10 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
res0 = POOL_OP(res0, data3);
)_";
- if(pool_type == PoolingType::AVG)
+ if (pool_type == PoolingType::AVG)
{
// If avg pooling divide result accordingly.
- if(_attributes.exclude_padding())
+ if (_attributes.exclude_padding())
{
code += R"_(
res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
@@ -339,7 +339,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
}
// Store result
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
code += R"_(
VEC_DATA_TYPE({{DATA_TYPE}}, N0)
@@ -365,17 +365,11 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -391,12 +385,15 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const
lut["meta_kernel_id"] = id();
// Retrieve relevant data
- const auto padding = _attributes.pad();
- const auto stride = _attributes.stride();
- const auto pool_size = _attributes.pool_size();
- const auto data_type = _src->data_type();
- const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
- const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+ const auto padding = _attributes.pad();
+ const auto stride = _attributes.stride();
+ const auto pool_size = _attributes.pool_size();
+ const auto data_type = _src->data_type();
+ const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() &&
+ _attributes.pool_type() != PoolingType::MAX;
+ const std::string max_initial_value =
+ _settings.use_inf_as_limit() ? "(-INFINITY)"
+ : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
// pool specific
lut["STRIDE_X"] = stride.x();
@@ -407,7 +404,8 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const
lut["POOL_SIZE_Y"] = pool_size.height;
// Datatypes and variables
- lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
+ lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type(
+ (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
lut["SRC_WIDTH"] = _src->dimension(width_idx);
lut["SRC_HEIGHT"] = _src->dimension(height_idx);
@@ -454,14 +452,14 @@ std::string ClTemplatePool2d::get_config_id() const
std::set<std::string> ClTemplatePool2d::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h", "repeat.h"};
}
Window ClTemplatePool2d::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
const auto output_shape = _dst->tensor_shape();
- const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
// Create and configure kernel window
auto win = calculate_max_window(output_shape, Steps(vec_size));
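The pooling hunks above pick the POOL_OP macro (add for average, fmax for max) and widen the accumulator to F32 only when F16 mixed precision is requested and the pool type is not MAX. A standalone sketch of those decisions; the types and names below are illustrative, not the library's:

#include <iostream>
#include <string>

enum class PoolType { Avg, Max };
enum class DType { F16, F32 };

struct PoolConfig
{
    std::string pool_op_macro;
    DType       acc_type;
};

PoolConfig make_pool_config(PoolType pool, DType data_type, bool mixed_precision)
{
    const bool fp_mixed = (data_type == DType::F16) && mixed_precision && pool != PoolType::Max;
    PoolConfig cfg{};
    cfg.pool_op_macro = (pool == PoolType::Avg) ? "#define POOL_OP(x,y) ((x) + (y))"
                                                : "#define POOL_OP(x,y) (fmax((x), (y)))";
    cfg.acc_type = fp_mixed ? DType::F32 : data_type; // wider accumulator only when it helps
    return cfg;
}

int main()
{
    const PoolConfig cfg = make_pool_config(PoolType::Avg, DType::F16, true);
    std::cout << cfg.pool_op_macro << "\n";
    std::cout << "acc is F32: " << (cfg.acc_type == DType::F32) << "\n";
    return 0;
}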
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
index ef1c100f44..d1d3c01669 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
index 8b50f1e209..c882353fcb 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
@@ -36,11 +37,8 @@ namespace dynamic_fusion
{
constexpr unsigned int vector_size_byte_opencl = 16;
-ClTemplateReshape::ClTemplateReshape(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors)
- : IGpuTemplateComponentWriter{ id, tensors },
- _src{},
- _dst{}
+ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -97,23 +95,17 @@ TILE(uint, M0, 1, g_dst_indirect_y);
void ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D
- "src");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(common_tensor_type),
- "dst");
+ vtable.declare_variable(comp_group, _src,
+ GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D
+ "src");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
}
TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
ARM_COMPUTE_UNUSED(comp_group);
- TagLUT lut{};
+ TagLUT lut{};
// Arguments and global shared variables
lut["src"] = vtable.get_variable(_src);
@@ -153,7 +145,7 @@ std::string ClTemplateReshape::get_config_id() const
std::set<std::string> ClTemplateReshape::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateReshape::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
index 56b6585b61..838a21db6d 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE
#include "arm_compute/core/experimental/Types.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
@@ -42,8 +43,7 @@ public:
* @param[in] id Component id
* @param[in] tensors Tensor arguments to the components
*/
- ClTemplateReshape(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors);
+ ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
/** Prevent instances of this class from being copy constructed */
ClTemplateReshape(const ClTemplateReshape &reshape) = delete;
/** Prevent instances of this class from being copied */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
index aaed1d990d..846c712ceb 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/ScaleUtils.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
@@ -37,8 +38,10 @@ namespace experimental
{
namespace dynamic_fusion
{
-ClTemplateResize::ClTemplateResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const ClTemplateResize::Attributes &attributes)
- : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes }
+ClTemplateResize::ClTemplateResize(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const ClTemplateResize::Attributes &attributes)
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -63,9 +66,9 @@ TILE(uint, 1, 1, g_dst_indirect_y);
const int bout = g_ind_2 / {{arg_dst}}_h;
)_";
- if(_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
+ if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
{
- if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
+ if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
{
code += R"_(
float xi_f = (g_ind_1 * {{SCALE_X}});
@@ -80,7 +83,7 @@ TILE(uint, 1, 1, g_dst_indirect_y);
)_";
}
- if(_attributes.align_corners())
+ if (_attributes.align_corners())
{
code += R"_(
xi_f = round(xi_f);
@@ -95,9 +98,9 @@ TILE(uint, 1, 1, g_dst_indirect_y);
T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}});
)_";
}
- else if(_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
+ else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
{
- if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
+ if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
{
code += R"_(
float xi_f = (g_ind_1 * {{SCALE_X}});
@@ -137,7 +140,7 @@ TILE(uint, 1, 1, g_dst_indirect_y);
T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11);
)_";
- if(is_data_type_float(_src->data_type()))
+ if (is_data_type_float(_src->data_type()))
{
code += R"_(
const {{SRC_DATA_TYPE}} a = ({{SRC_DATA_TYPE}})(xi_f - (float)xi);
@@ -158,9 +161,9 @@ TILE(uint, 1, 1, g_dst_indirect_y);
const float b1 = (1.f - a1);
{{dst}}[0].v = CONVERT_SAT(
- (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
+ (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
(CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
- (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
+ (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
(CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0));
)_";
}
@@ -179,22 +182,18 @@ TILE(uint, 1, 1, g_dst_indirect_y);
return code;
}
-void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
+void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable,
+ const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
-TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
+TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable,
+ const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
{
TagLUT lut{};
@@ -212,8 +211,10 @@ TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const
lut["DST_DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type());
lut["CONSTANT_VALUE"] = string_from_pixel_value(0, _src->data_type());
- const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
- const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
+ const float scale_x =
+ scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
+ const float scale_y =
+ scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
lut["SCALE_X"] = float_to_string_with_full_precision(scale_x);
lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y);
@@ -242,7 +243,8 @@ std::string ClTemplateResize::get_config_id() const
std::string config_id{};
config_id += "resize_";
- config_id += (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
+ config_id +=
+ (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : "");
config_id += "_";
config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft");
@@ -260,7 +262,7 @@ std::string ClTemplateResize::get_config_id() const
std::set<std::string> ClTemplateResize::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h"};
}
Window ClTemplateResize::get_window() const
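The resize hunks above compute an input/output scale ratio per axis and blend four neighbours with bilinear weights a/b along x and a1/b1 along y. A standalone sketch of that arithmetic, assuming the common align-corners convention of (in-1)/(out-1); this is not arm_compute's scale_utils API:

#include <cmath>
#include <iostream>

float resize_ratio(unsigned int in, unsigned int out, bool align_corners)
{
    if (align_corners && out > 1)
    {
        return static_cast<float>(in - 1) / static_cast<float>(out - 1);
    }
    return static_cast<float>(in) / static_cast<float>(out);
}

float bilinear(float in00, float in01, float in10, float in11, float xi_f, float yi_f)
{
    const float a  = xi_f - std::floor(xi_f); // fractional x, weight of the right column
    const float b  = 1.f - a;
    const float a1 = yi_f - std::floor(yi_f); // fractional y, weight of the bottom row
    const float b1 = 1.f - a1;
    return in00 * b * b1 + in01 * a * b1 + in10 * b * a1 + in11 * a * a1;
}

int main()
{
    std::cout << resize_ratio(8, 4, false) << "\n";                // 2.0
    std::cout << bilinear(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f) << "\n"; // 1.5
    return 0;
}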
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index 217214ced3..d0ec91e0a9 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -32,7 +32,7 @@ namespace experimental
namespace dynamic_fusion
{
ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
- : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -61,16 +61,10 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group
void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
index 3f97a82204..b8c82ceadd 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
#include "arm_compute/core/experimental/Types.h"
+
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index eda15f1d95..d3d7c8db83 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -24,6 +24,7 @@
#include "ClTemplateWriter.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
@@ -39,11 +40,11 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con
std::string replaced_code = "";
bool scanning_pattern = false;
std::string pattern_found = "";
- for(size_t i = 0; i < code_template.size() - 1; ++i)
+ for (size_t i = 0; i < code_template.size() - 1; ++i)
{
- if(!scanning_pattern)
+ if (!scanning_pattern)
{
- if(code_template[i] == '{' && code_template[i + 1] == '{')
+ if (code_template[i] == '{' && code_template[i + 1] == '{')
{
i += 1;
scanning_pattern = true;
@@ -56,7 +57,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con
}
else
{
- if(code_template[i] == '}' && code_template[i + 1] == '}')
+ if (code_template[i] == '}' && code_template[i + 1] == '}')
{
i += 1;
scanning_pattern = false;
@@ -76,8 +77,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con
ClTemplateWriter::~ClTemplateWriter()
{
}
-ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
- : _components{ components }
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components}
{
}
std::string ClTemplateWriter::get_name()
@@ -91,7 +91,7 @@ std::string ClTemplateWriter::get_code()
std::string ClTemplateWriter::get_config_id()
{
std::string config_id = get_name();
- for(const auto &comp : _components)
+ for (const auto &comp : _components)
{
config_id += "--" + comp->template_writer()->get_config_id() + "--";
}
@@ -103,7 +103,7 @@ CLBuildOptions ClTemplateWriter::get_build_options()
{
CLBuildOptions build_opts{};
- for(const auto &comp : _components)
+ for (const auto &comp : _components)
{
build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
}
@@ -122,11 +122,9 @@ std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
{
// Assemble GpuKernelArguments
std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
- for(const auto t : _components.get_argument_tensors())
+ for (const auto t : _components.get_argument_tensors())
{
- tensors.emplace(
- t->id(),
- GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+ tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info});
}
return tensors;
}
@@ -141,22 +139,24 @@ std::string ClTemplateWriter::write_code()
std::vector<std::string> component_codes{}; // vector because order matters
// Pass 1: Declare all kernel variables
- for(auto &component : _components)
+ for (auto &component : _components)
{
component->template_writer()->declare_variables(_vtable, _components);
}
// Pass 2: Generate component codes
- for(auto &component : _components)
+ for (auto &component : _components)
{
const auto component_writer = component->template_writer();
auto curr_headers_list = component_writer->get_headers_list();
auto curr_additional_macros = component_writer->get_additional_macros();
auto curr_component_code = component_writer->get_component_code(_components);
- const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+ const auto var_lut = component_writer->get_tag_lut(
+ _vtable,
+ _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
component_codes.push_back(replace_tags(curr_component_code, var_lut));
headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
- if(!additional_macros.empty()) // Some components might not have any
+ if (!additional_macros.empty()) // Some components might not have any
{
additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
}
@@ -165,7 +165,7 @@ std::string ClTemplateWriter::write_code()
// Step 3: Assemble the data gathered by traversing the graph into the string "code"
std::string code = "";
- for(auto &header : headers_list)
+ for (auto &header : headers_list)
{
#if defined(EMBEDDED_KERNELS)
code += CLKernelLibrary::get().get_program(header).first;
@@ -174,16 +174,14 @@ std::string ClTemplateWriter::write_code()
#endif // defined(EMBEDDED_KERNELS)
}
- for(auto &macros : additional_macros)
+ for (auto &macros : additional_macros)
{
code += macros;
}
auto arguments = _components.get_argument_tensors();
- std::sort(arguments.begin(), arguments.end(), [](const ITensorInfo * l, const ITensorInfo * r)
- {
- return l->id() < r->id();
- });
+ std::sort(arguments.begin(), arguments.end(),
+ [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); });
code += write_kernel_signature(_vtable.get_variable_list(arguments));
code += "\n{\n\n";
@@ -198,7 +196,7 @@ std::string ClTemplateWriter::write_code()
tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n";
- for(auto tile : tiles)
+ for (auto tile : tiles)
{
const auto var = _vtable.get_variable(tile);
const auto data_type = get_cl_type_from_data_type(tile->data_type());
@@ -212,7 +210,7 @@ std::string ClTemplateWriter::write_code()
code += tiles_ss.str();
}
- for(const auto &component_code : component_codes)
+ for (const auto &component_code : component_codes)
{
code += component_code;
code += "\n";
@@ -231,7 +229,8 @@ std::string ClTemplateWriter::write_global_section() const
auto leftover_w = dst_w % tile_w;
std::string code = "";
- code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " +
+ std::to_string(leftover_w) + ");\n";
code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
@@ -243,7 +242,7 @@ std::string ClTemplateWriter::write_global_section() const
std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
{
std::string code;
- switch(var.kernel_argument_info.type)
+ switch (var.kernel_argument_info.type)
{
case GpuKernelArgumentInfo::Type::Vector:
{
@@ -293,11 +292,11 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl
{
std::string code = "\n__kernel void " + write_kernel_name() + "(";
- for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
+ for (int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
{
code += write_argument_declaration(argument_list[i]) + ",";
}
- if(static_cast<int>(argument_list.size()) - 1 >= 0)
+ if (static_cast<int>(argument_list.size()) - 1 >= 0)
{
code += write_argument_declaration(argument_list[argument_list.size() - 1]);
}
@@ -308,12 +307,12 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl
}
std::string ClTemplateWriter::write_kernel_name() const
{
- if(_components.empty())
+ if (_components.empty())
{
return "empty_kernel";
}
std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name();
- for(size_t i = 1; i < _components.size(); ++i)
+ for (size_t i = 1; i < _components.size(); ++i)
{
name += "___";
name += _components[i]->template_writer()->get_name();
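The ClTemplateWriter::replace_tags() hunk above scans the template for "{{tag}}" markers and substitutes them from a lookup table. A compact standalone equivalent, using std::string::find instead of the character-by-character scanner (illustrative, not the library's implementation):

#include <iostream>
#include <map>
#include <string>

std::string replace_tags(std::string code, const std::map<std::string, std::string> &lut)
{
    std::size_t pos = 0;
    while ((pos = code.find("{{", pos)) != std::string::npos)
    {
        const std::size_t end = code.find("}}", pos + 2);
        if (end == std::string::npos)
        {
            break; // unterminated tag: leave the remainder untouched
        }
        const std::string tag   = code.substr(pos + 2, end - pos - 2);
        const auto        it    = lut.find(tag);
        const std::string value = (it != lut.end()) ? it->second : "";
        code.replace(pos, end + 2 - pos, value); // swap "{{tag}}" for its value
        pos += value.size();
    }
    return code;
}

int main()
{
    const std::map<std::string, std::string> lut{{"src", "src_tensor"}, {"DATA_TYPE", "float"}};
    std::cout << replace_tags("TILE({{DATA_TYPE}}, M0, N0, {{src}});", lut) << "\n";
    return 0;
}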
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
index c891e76d8b..c157c2b21c 100644
--- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -25,6 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
#include "arm_compute/core/Error.h"
+
#include <cstdint>
#include <map>
#include <set>
@@ -68,12 +69,10 @@ public:
OperatorId op{};
std::vector<TensorId> inputs{};
std::vector<TensorId> outputs{};
- friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
{
- return std::make_tuple(
- opp0.op, opp0.inputs, opp0.outputs)
- == std::make_tuple(
- opp1.op, opp1.inputs, opp1.outputs);
+ return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) ==
+ std::make_tuple(opp1.op, opp1.inputs, opp1.outputs);
}
};
@@ -95,10 +94,13 @@ public:
* @return true If the operator can be added while keeping the graph as a linear sequence
* @return false Otherwise
*/
- bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false) const
+ bool try_add_operator_as_linear(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false) const
{
ARM_COMPUTE_UNUSED(op, is_output);
- if(all_ops().empty())
+ if (all_ops().empty())
{
return true;
}
@@ -106,25 +108,25 @@ public:
// If the new operator is not the first operator, at least one input tensor must be
// the output tensor of the last non-output operator. All other input tensors must be
// the global input of the graph (i.e. not the output of any operator).
- if(_last_op_available)
+ if (_last_op_available)
{
auto use_input_from_last_op = false;
- for(auto src_tensor : inputs)
+ for (auto src_tensor : inputs)
{
const auto src_ops = _adj_src_ops.find(src_tensor);
- if(src_ops != _adj_src_ops.end())
+ if (src_ops != _adj_src_ops.end())
{
ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1);
- if(!src_ops->second.empty())
+ if (!src_ops->second.empty())
{
const auto src_op = src_ops->second[0];
- if(src_op == _last_op)
+ if (src_op == _last_op)
{
- if(use_input_from_last_op)
+ if (use_input_from_last_op)
{
// To be safe, we also forbid using the output tensor
// of the last operator twice.
@@ -143,7 +145,7 @@ public:
}
}
- if(!use_input_from_last_op)
+ if (!use_input_from_last_op)
{
// At least one input tensor must be the output tensor of the last non-output operator.
return false;
@@ -152,9 +154,9 @@ public:
// The output tensor of the new operator must not be the input tensor of any previously
// added operator.
- for(auto dst_tensor : outputs)
+ for (auto dst_tensor : outputs)
{
- if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
+ if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
{
return false;
}
@@ -168,7 +170,10 @@ public:
* INVARIANT: The list can only grow from head to tail
* INVARIANT: POSTCONDITION: The graph is linear
*/
- void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false)
+ void add_operator_as_linear(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false)
{
const auto success = add_operator(op, inputs, outputs, is_output);
ARM_COMPUTE_UNUSED(success);
@@ -183,24 +188,27 @@ public:
* @param[in] outputs Output tensors to the operator
* @param[in] is_output Whether this is an output operator
*/
- bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false)
+ bool add_operator(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false)
{
- if(operator_exists(op))
+ if (operator_exists(op))
{
return false;
}
_adj_src_tensors[op] = {};
_adj_dst_tensors[op] = {};
- for(auto in_tensor : inputs)
+ for (auto in_tensor : inputs)
{
// Linking input tensor to operator node will never create a cycle / loop because we guarantee
// each op is newly created, so every <input, op> pair / edge is new
link_input(op, in_tensor);
}
- for(auto out_tensor : outputs)
+ for (auto out_tensor : outputs)
{
// If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
- if(path_exists_from_tensor_to_op(out_tensor, op))
+ if (path_exists_from_tensor_to_op(out_tensor, op))
{
remove_operator(op);
return false;
@@ -211,10 +219,10 @@ public:
}
}
- if(!is_output)
+ if (!is_output)
{
_last_op_available = true;
- _last_op = op;
+ _last_op = op;
}
return true;
@@ -230,16 +238,16 @@ public:
std::vector<OpPack> build_operators_sequence() const
{
std::vector<OpPack> ops_seq;
- std::set<Id> done_ops;
- std::set<Id> done_tensors;
+ std::set<Id> done_ops;
+ std::set<Id> done_tensors;
const auto input_tensors = global_src_tensors();
- for(auto tensor : input_tensors)
+ for (auto tensor : input_tensors)
{
done_tensors.insert(tensor);
- for(auto op : _adj_dst_ops.at(tensor))
+ for (auto op : _adj_dst_ops.at(tensor))
{
build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors);
}
@@ -260,10 +268,8 @@ public:
friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
{
// Do not compare id allocators
- return std::make_tuple(
- g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops)
- == std::make_tuple(
- g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+ return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) ==
+ std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
}
std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
{
@@ -280,10 +286,8 @@ public:
std::vector<TensorId> all_tensors() const
{
std::vector<TensorId> tensors{};
- std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
- {
- return it.first;
- });
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors),
+ [](const auto &it) { return it.first; });
return tensors;
}
/** Get source tensors of the whole graph
@@ -293,9 +297,9 @@ public:
std::vector<TensorId> global_src_tensors() const
{
std::vector<TensorId> tensors;
- for(auto tensor_src_ops : _adj_src_ops)
+ for (auto tensor_src_ops : _adj_src_ops)
{
- if(tensor_src_ops.second.empty())
+ if (tensor_src_ops.second.empty())
{
tensors.push_back(tensor_src_ops.first);
}
@@ -309,9 +313,9 @@ public:
std::vector<TensorId> global_dst_tensors() const
{
std::vector<TensorId> tensors;
- for(auto tensor_dst_ops : _adj_dst_ops)
+ for (auto tensor_dst_ops : _adj_dst_ops)
{
- if(tensor_dst_ops.second.empty())
+ if (tensor_dst_ops.second.empty())
{
tensors.push_back(tensor_dst_ops.first);
}
@@ -328,14 +332,14 @@ public:
// If a tensor is used to connect the input of an operator and the output of another operator,
// it is not allocated in the memory. The tensor exists as a temporary variable only.
- for(auto src_tensor : _adj_src_ops)
+ for (auto src_tensor : _adj_src_ops)
{
- if(!src_tensor.second.empty())
+ if (!src_tensor.second.empty())
{
const auto dst_tensor = _adj_dst_ops.find(src_tensor.first);
- if(dst_tensor != _adj_dst_ops.end())
+ if (dst_tensor != _adj_dst_ops.end())
{
- if(!dst_tensor->second.empty())
+ if (!dst_tensor->second.empty())
{
tensors.push_back(src_tensor.first);
}
@@ -354,9 +358,9 @@ public:
std::vector<OperatorId> ops{};
const auto op_list = all_ops();
- for(auto op : op_list)
+ for (auto op : op_list)
{
- if(src_ops(op).empty())
+ if (src_ops(op).empty())
{
ops.emplace_back(op);
}
@@ -368,7 +372,7 @@ private:
void link_input(OperatorId op, TensorId in_tensor)
{
ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- if(!tensor_exists(in_tensor))
+ if (!tensor_exists(in_tensor))
{
insert_new_tensor(in_tensor);
}
@@ -379,7 +383,7 @@ private:
void link_output(OperatorId op, TensorId out_tensor)
{
ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- if(!tensor_exists(out_tensor))
+ if (!tensor_exists(out_tensor))
{
insert_new_tensor(out_tensor);
}
@@ -392,7 +396,7 @@ private:
{
ARM_COMPUTE_ERROR_ON(!operator_exists(op));
std::vector<OperatorId> ops{};
- for(TensorId src_tensor : src_tensors(op))
+ for (TensorId src_tensor : src_tensors(op))
{
ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
}
@@ -402,7 +406,7 @@ private:
{
ARM_COMPUTE_ERROR_ON(!operator_exists(op));
std::vector<OperatorId> ops{};
- for(TensorId dst_tensor : _adj_dst_tensors.at(op))
+ for (TensorId dst_tensor : _adj_dst_tensors.at(op))
{
ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
}
@@ -436,10 +440,8 @@ private:
std::vector<OperatorId> all_ops() const
{
std::vector<OperatorId> ops{};
- std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
- {
- return it.first;
- });
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops),
+ [](const auto &it) { return it.first; });
return ops;
}
/** Remove an operator from graph.
@@ -448,25 +450,21 @@ private:
*/
void remove_operator(OperatorId op)
{
- for(auto src_tensor : _adj_src_tensors.at(op))
+ for (auto src_tensor : _adj_src_tensors.at(op))
{
auto &dst_ops = _adj_dst_ops.at(src_tensor);
- dst_ops.erase(
- std::remove(std::begin(dst_ops), std::end(dst_ops), op),
- std::end(dst_ops));
+ dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops));
}
- for(auto dst_tensor : _adj_dst_tensors.at(op))
+ for (auto dst_tensor : _adj_dst_tensors.at(op))
{
auto &src_ops = _adj_src_ops.at(dst_tensor);
- src_ops.erase(
- std::remove(std::begin(src_ops), std::end(src_ops), op),
- std::end(src_ops));
+ src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops));
}
// Remove any isolated tensors
// An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
- for(auto t : all_tensors())
+ for (auto t : all_tensors())
{
- if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+ if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
{
_adj_src_ops.erase(t);
_adj_dst_ops.erase(t);
@@ -486,11 +484,12 @@ private:
}
bool operator_exists(OperatorId op) const
{
- return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() &&
+ _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
}
bool is_src_tensor_of(OperatorId op, TensorId tensor) const
{
- if(!operator_exists(op) || !tensor_exists(tensor))
+ if (!operator_exists(op) || !tensor_exists(tensor))
{
return false;
}
@@ -499,7 +498,7 @@ private:
}
bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
{
- if(!operator_exists(op) || !tensor_exists(tensor))
+ if (!operator_exists(op) || !tensor_exists(tensor))
{
return false;
}
@@ -525,9 +524,9 @@ private:
std::vector<OperatorId> ops{};
const auto op_list = all_ops();
- for(auto op : op_list)
+ for (auto op : op_list)
{
- if(is_dst_op(op))
+ if (is_dst_op(op))
{
ops.emplace_back(op);
}
@@ -536,13 +535,13 @@ private:
}
bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
{
- if(!tensor_exists(src_tensor) || !operator_exists(dst_op))
+ if (!tensor_exists(src_tensor) || !operator_exists(dst_op))
{
return false;
}
- for(auto child_op : dst_ops_from_tensor(src_tensor))
+ for (auto child_op : dst_ops_from_tensor(src_tensor))
{
- if(path_exists_from_op_to_op(child_op, dst_op))
+ if (path_exists_from_op_to_op(child_op, dst_op))
{
return true;
}
@@ -552,21 +551,21 @@ private:
bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
{
- if(!operator_exists(src_op) || !operator_exists(dst_op))
+ if (!operator_exists(src_op) || !operator_exists(dst_op))
{
return false;
}
- if(src_op == dst_op)
+ if (src_op == dst_op)
{
return true;
}
- if(is_in(src_op, get_dst_ops()))
+ if (is_in(src_op, get_dst_ops()))
{
return false;
}
- for(auto child_tensor : dst_tensors(src_op))
+ for (auto child_tensor : dst_tensors(src_op))
{
- if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+ if (path_exists_from_tensor_to_op(child_tensor, dst_op))
{
return true;
}
@@ -574,16 +573,15 @@ private:
return false;
}
- void build_operators_sequence_from_op(
- Id op,
- std::vector<OpPack> &ops_seq,
- std::set<Id> &done_ops,
- std::set<Id> &done_tensors) const
+ void build_operators_sequence_from_op(Id op,
+ std::vector<OpPack> &ops_seq,
+ std::set<Id> &done_ops,
+ std::set<Id> &done_tensors) const
{
- while(true)
+ while (true)
{
// If the operator has been added to the sequence, ignore it.
- if(done_ops.find(op) != done_ops.end())
+ if (done_ops.find(op) != done_ops.end())
{
return;
}
@@ -593,9 +591,9 @@ private:
// is added to the sequence.
const auto src_tensors = _adj_src_tensors.at(op);
- for(auto src : src_tensors)
+ for (auto src : src_tensors)
{
- if(done_tensors.find(src) == done_tensors.end())
+ if (done_tensors.find(src) == done_tensors.end())
{
return;
}
@@ -606,24 +604,24 @@ private:
done_ops.insert(op);
- OpPack pack{ op, src_tensors, dst_tensors };
+ OpPack pack{op, src_tensors, dst_tensors};
ops_seq.push_back(pack);
done_tensors.insert(dst_tensors.begin(), dst_tensors.end());
// Visit all the sink operators.
// Call this function recursively unless there is only one sink.
- if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
+ if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
{
op = _adj_dst_ops.at(dst_tensors[0])[0];
}
else
{
- for(auto dst_tensor : dst_tensors)
+ for (auto dst_tensor : dst_tensors)
{
const auto dst_ops = _adj_dst_ops.at(dst_tensor);
- for(auto dst_op : dst_ops)
+ for (auto dst_op : dst_ops)
{
build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors);
}
@@ -640,8 +638,8 @@ private:
AdjList _adj_src_ops{};
AdjList _adj_dst_ops{};
- bool _last_op_available{ false };
- OperatorId _last_op{ 0 };
+ bool _last_op_available{false};
+ OperatorId _last_op{0};
};
} // namespace dynamic_fusion
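DependencyGraph.h above guards add_operator with a reachability check so that linking an operator's output tensor can never introduce a cycle. A simplified, illustrative sketch of that idea follows, using plain std::map adjacency lists and hypothetical names (not the library's class), and assuming the existing graph is already acyclic:

// Sketch of the back-path test: before adding edge op -> out_tensor, reject
// the link if a path already leads from out_tensor back to op.
#include <cstdio>
#include <map>
#include <vector>

using Id = int;

std::map<Id, std::vector<Id>> tensor_to_ops; // tensor -> operators that read it
std::map<Id, std::vector<Id>> op_to_tensors; // operator -> tensors it writes

bool path_from_tensor_to_op(Id tensor, Id target_op)
{
    for (Id op : tensor_to_ops[tensor])
    {
        if (op == target_op)
        {
            return true;
        }
        for (Id t : op_to_tensors[op])
        {
            if (path_from_tensor_to_op(t, target_op))
            {
                return true;
            }
        }
    }
    return false;
}

int main()
{
    // op 0 reads tensor 10 and writes tensor 11; op 1 reads tensor 11.
    tensor_to_ops[10] = {0};
    op_to_tensors[0]  = {11};
    tensor_to_ops[11] = {1};
    op_to_tensors[1]  = {12};

    // Adding tensor 12 as another output of op 0 is fine (no back path),
    // but adding tensor 10 (op 0's own input) would close a loop.
    std::printf("%d\n", path_from_tensor_to_op(12, 0)); // 0
    std::printf("%d\n", path_from_tensor_to_op(10, 0)); // 1
    return 0;
}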
diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h
index c9fc2c610f..3f4a2edd03 100644
--- a/src/dynamic_fusion/utils/Utils.h
+++ b/src/dynamic_fusion/utils/Utils.h
@@ -63,17 +63,21 @@ inline bool is_invalid_tensor(const ITensorInfo *tensor_info)
/** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo
*/
-inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, bool mixed_precision = false, DataLayout data_layout = DataLayout::NHWC)
+inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr,
+ bool mixed_precision = false,
+ DataLayout data_layout = DataLayout::NHWC)
{
// Create PadStrideInfo
const Size2D stride = pool_attr.stride();
const Padding2D padding = pool_attr.pad();
- const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, arm_compute::DimensionRoundingType::FLOOR);
+ const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top,
+ arm_compute::DimensionRoundingType::FLOOR);
- return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, pool_attr.exclude_padding(), mixed_precision);
-}
-}
-}
+ return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride,
+ pool_attr.exclude_padding(), mixed_precision);
}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
#endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */
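convert_pool_attr_to_pool_info above builds its PadStrideInfo with DimensionRoundingType::FLOOR. As an illustrative aside (not library code), the pooled extent of one dimension under FLOOR rounding follows the usual out = floor((in + pad_before + pad_after - pool) / stride) + 1 formula:

// Hypothetical helper, shown only to spell out the FLOOR rounding rule.
#include <cstdio>

int pooled_extent_floor(int in, int pool, int stride, int pad_before, int pad_after)
{
    // Integer division truncates, which equals floor for non-negative values.
    return (in + pad_before + pad_after - pool) / stride + 1;
}

int main()
{
    // 112-wide input, 3x3 pool, stride 2, padding 1 on each side -> 56
    std::printf("%d\n", pooled_extent_floor(112, 3, 2, 1, 1));
    return 0;
}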
diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp
index d8ef18e62e..611c1cb501 100644
--- a/src/gpu/cl/ClContext.cpp
+++ b/src/gpu/cl/ClContext.cpp
@@ -23,11 +23,11 @@
*/
#include "src/gpu/cl/ClContext.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "src/gpu/cl/ClQueue.h"
#include "src/gpu/cl/ClTensor.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
namespace arm_compute
{
namespace gpu
@@ -41,7 +41,7 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
bool status = false;
mlgo::MLGOHeuristics heuristics;
- if(filename != nullptr)
+ if (filename != nullptr)
{
status = heuristics.reload_from_file(filename);
}
@@ -50,12 +50,9 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
} // namespace
ClContext::ClContext(const AclContextOptions *options)
- : IContext(Target::GpuOcl),
- _mlgo_heuristics(),
- _cl_ctx(),
- _cl_dev()
+ : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev()
{
- if(options != nullptr)
+ if (options != nullptr)
{
_mlgo_heuristics = populate_mlgo(options->kernel_config_file);
}
@@ -80,7 +77,7 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const
bool ClContext::set_cl_ctx(::cl::Context ctx)
{
- if(this->refcount() == 0)
+ if (this->refcount() == 0)
{
_cl_ctx = ctx;
CLScheduler::get().set_context(ctx);
@@ -92,7 +89,7 @@ bool ClContext::set_cl_ctx(::cl::Context ctx)
ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
{
ClTensor *tensor = new ClTensor(this, desc);
- if(tensor != nullptr && allocate)
+ if (tensor != nullptr && allocate)
{
tensor->allocate();
}

diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h
index a50b03124b..2c67ccf4d2 100644
--- a/src/gpu/cl/ClContext.h
+++ b/src/gpu/cl/ClContext.h
@@ -24,11 +24,11 @@
#ifndef SRC_GPU_CLCONTEXT_H
#define SRC_GPU_CLCONTEXT_H
+#include "arm_compute/core/CL/OpenCL.h"
+
#include "src/common/IContext.h"
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
namespace arm_compute
{
namespace gpu
@@ -74,9 +74,9 @@ public:
bool set_cl_ctx(::cl::Context ctx);
    // Inherited methods overridden
- ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
- IQueue *create_queue(const AclQueueOptions *options) override;
- std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+ ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+ IQueue *create_queue(const AclQueueOptions *options) override;
+ std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
const AclTensorDescriptor &dst,
const AclActivationDescriptor &act,
bool is_validate) override;
@@ -90,4 +90,4 @@ private:
} // namespace gpu
} // namespace arm_compute
-#endif /* SRC_GPU_CLCONTEXT_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLCONTEXT_H */
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 73bb96298e..bcade94522 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -37,24 +37,16 @@
namespace
{
/* Decoding table */
-constexpr std::array<uint8_t, 256> b64_invtab =
-{
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
- 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
- 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
- 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+constexpr std::array<uint8_t, 256> b64_invtab = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 25, 0, 0, 0, 0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/** Decode a base64 encoded string
@@ -68,13 +60,13 @@ std::string decode_base64(const std::string &str)
constexpr const char pad_char = '=';
// Handle empty string
- if(str.empty())
+ if (str.empty())
{
return {};
}
// Base64 encoded string has size multiple of 4
- if(str.length() % 4)
+ if (str.length() % 4)
{
return {};
}
@@ -92,7 +84,7 @@ std::string decode_base64(const std::string &str)
// Block decoding function (exclude padding)
int c = 0;
const int end = str_len - 4 - padding;
- for(; c <= end; c += 4)
+ for (; c <= end; c += 4)
{
const int byte0 = b64_invtab[str[c]];
const int byte1 = b64_invtab[str[c + 1]];
@@ -105,7 +97,7 @@ std::string decode_base64(const std::string &str)
}
// Last step that might contain padding symbols
- if(padding == 1)
+ if (padding == 1)
{
const int byte0 = b64_invtab[str[c]];
const int byte1 = b64_invtab[str[c + 1]];
@@ -114,7 +106,7 @@ std::string decode_base64(const std::string &str)
dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
}
- else if(padding == 2)
+ else if (padding == 2)
{
const int byte0 = b64_invtab[str[c]];
const int byte1 = b64_invtab[str[c + 1]];
@@ -135,7 +127,7 @@ std::string decompress_zlib(const std::string &str)
{
// Create and initialize decompression stream
z_stream ds{};
- if(inflateInit(&ds) != Z_OK)
+ if (inflateInit(&ds) != Z_OK)
{
return std::string();
}
@@ -152,16 +144,15 @@ std::string decompress_zlib(const std::string &str)
ds.next_out = reinterpret_cast<Bytef *>(roll_buff);
status = inflate(&ds, 0);
- if(inflated_str.size() < ds.total_out)
+ if (inflated_str.size() < ds.total_out)
{
inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
}
- }
- while(status == Z_OK);
+ } while (status == Z_OK);
// Finalize decompression stream
inflateEnd(&ds);
- if(status != Z_STREAM_END)
+ if (status != Z_STREAM_END)
{
return std::string();
}
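decode_base64 above works in blocks of four symbols via the 256-entry inverse table and then special-cases the '=' padding. A minimal, self-contained sketch of the same table-driven approach (simplified, with no validation beyond the length check, and not the library implementation):

// Illustrative base64 decoder: build the inverse table at runtime, turn each
// group of four 6-bit symbols into three bytes, then drop the bytes that came
// from '=' padding.
#include <array>
#include <cstdint>
#include <cstdio>
#include <string>

std::string decode_base64_sketch(const std::string &str)
{
    static const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    std::array<uint8_t, 256> invtab{};
    for (size_t i = 0; i < alphabet.size(); ++i)
    {
        invtab[static_cast<uint8_t>(alphabet[i])] = static_cast<uint8_t>(i);
    }

    if (str.empty() || (str.length() % 4) != 0)
    {
        return {};
    }

    const size_t padding = (str[str.length() - 1] == '=') + (str[str.length() - 2] == '=');

    std::string out;
    for (size_t c = 0; c + 4 <= str.length(); c += 4)
    {
        const int b0 = invtab[static_cast<uint8_t>(str[c])];
        const int b1 = invtab[static_cast<uint8_t>(str[c + 1])];
        const int b2 = invtab[static_cast<uint8_t>(str[c + 2])];
        const int b3 = invtab[static_cast<uint8_t>(str[c + 3])];

        out.push_back(static_cast<char>(((b0 << 2) | (b1 >> 4)) & 0xFF));
        out.push_back(static_cast<char>(((b1 << 4) | (b2 >> 2)) & 0xFF));
        out.push_back(static_cast<char>(((b2 << 6) | b3) & 0xFF));
    }

    out.resize(out.size() - padding); // remove bytes produced from '=' padding
    return out;
}

int main()
{
    std::printf("%s\n", decode_base64_sketch("SGVsbG8=").c_str()); // Hello
    return 0;
}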
@@ -175,323 +166,321 @@ namespace arm_compute
{
namespace opencl
{
-const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
-{
+const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = {
// Common Kernels
- { "activation_layer", "common/activation_layer.cl" },
- { "activation_layer_quant", "common/activation_layer_quant.cl" },
- { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
- { "arg_min_max_x", "common/arg_min_max.cl" },
- { "arg_min_max_y", "common/arg_min_max.cl" },
- { "arg_min_max_z", "common/arg_min_max.cl" },
- { "arg_min_max_w", "common/arg_min_max.cl" },
- { "bitwise_or", "common/bitwise_op.cl" },
- { "bitwise_and", "common/bitwise_op.cl" },
- { "bitwise_xor", "common/bitwise_op.cl" },
- { "bitwise_not", "common/bitwise_op.cl" },
- { "bounding_box_transform", "common/bounding_box_transform.cl" },
- { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
- { "compare_equal", "common/comparisons.cl" },
- { "compare_equal_quantized", "common/comparisons.cl" },
- { "compare_notequal", "common/comparisons.cl" },
- { "compare_notequal_quantized", "common/comparisons.cl" },
- { "compare_greater", "common/comparisons.cl" },
- { "compare_greater_quantized", "common/comparisons.cl" },
- { "compare_greaterequal", "common/comparisons.cl" },
- { "compare_greaterequal_quantized", "common/comparisons.cl" },
- { "compare_less", "common/comparisons.cl" },
- { "compare_less_quantized", "common/comparisons.cl" },
- { "compare_lessequal", "common/comparisons.cl" },
- { "compare_lessequal_quantized", "common/comparisons.cl" },
- { "concatenate", "common/concatenate.cl" },
- { "concatenate_width", "common/concatenate.cl" },
- { "concatenate_height", "common/concatenate.cl" },
- { "concatenate_width_x2", "common/concatenate.cl" },
- { "concatenate_width_x4", "common/concatenate.cl" },
- { "col2im", "common/col2im.cl" },
- { "cast_down", "common/cast.cl" },
- { "cast_up", "common/cast.cl" },
- { "convert_fc_weights", "common/convert_fc_weights.cl" },
- { "copy_tensor", "common/copy_tensor.cl" },
- { "crop_tensor", "common/crop_tensor.cl" },
- { "deconvolution_reshape", "common/deconvolution_layer.cl" },
- { "deconvolution_upsample", "common/deconvolution_layer.cl" },
- { "dequantization_layer", "common/dequantization_layer.cl" },
- { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
- { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
- { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
- { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
- { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
- { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
- { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
- { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
- { "elementwise_operation_AND", "common/elementwise_operation.cl" },
- { "elementwise_operation_OR", "common/elementwise_operation.cl" },
- { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" },
- { "elementwise_unary", "common/elementwise_unary.cl" },
- { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" },
- { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" },
- { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" },
- { "fft_radix_2_first_stage_axis_0", "common/fft.cl" },
- { "fft_radix_2_first_stage_axis_1", "common/fft.cl" },
- { "fft_radix_2_axis_0", "common/fft.cl" },
- { "fft_radix_2_axis_1", "common/fft.cl" },
- { "fft_radix_3_first_stage_axis_0", "common/fft.cl" },
- { "fft_radix_3_first_stage_axis_1", "common/fft.cl" },
- { "fft_radix_3_axis_0", "common/fft.cl" },
- { "fft_radix_3_axis_1", "common/fft.cl" },
- { "fft_radix_4_first_stage_axis_0", "common/fft.cl" },
- { "fft_radix_4_first_stage_axis_1", "common/fft.cl" },
- { "fft_radix_4_axis_0", "common/fft.cl" },
- { "fft_radix_4_axis_1", "common/fft.cl" },
- { "fft_radix_5_first_stage_axis_0", "common/fft.cl" },
- { "fft_radix_5_first_stage_axis_1", "common/fft.cl" },
- { "fft_radix_5_axis_0", "common/fft.cl" },
- { "fft_radix_5_axis_1", "common/fft.cl" },
- { "fft_radix_7_first_stage_axis_0", "common/fft.cl" },
- { "fft_radix_7_first_stage_axis_1", "common/fft.cl" },
- { "fft_radix_7_axis_0", "common/fft.cl" },
- { "fft_radix_7_axis_1", "common/fft.cl" },
- { "fft_radix_8_first_stage_axis_0", "common/fft.cl" },
- { "fft_radix_8_first_stage_axis_1", "common/fft.cl" },
- { "fft_radix_8_axis_0", "common/fft.cl" },
- { "fft_radix_8_axis_1", "common/fft.cl" },
- { "fft_scale_conj", "common/fft_scale.cl" },
- { "fill_image_borders_constant", "common/fill_border.cl" },
- { "fill_image_borders_replicate", "common/fill_border.cl" },
- { "floor_layer", "common/floor.cl" },
- { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" },
- { "gather", "common/gather.cl" },
- { "gemm_ma_f16", "common/gemm.cl" },
- { "gemm_ma_f32", "common/gemm.cl" },
- { "gemm_mv", "common/gemv.cl" },
- { "gemm_mv_quantized", "common/gemv.cl" },
- { "gemm_mm_native", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
- { "gemm_lc_vm_f32", "common/gemm.cl" },
- { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
- { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
- { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" },
- { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" },
- { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
- { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
- { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
- { "gemmlowp_mm_native", "common/gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" },
- { "gemmlowp_offset_contribution", "common/gemmlowp.cl" },
- { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" },
- { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" },
- { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" },
- { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" },
- { "instance_normalization", "common/instance_normalization.cl" },
- { "compute_mean_var", "common/instance_normalization.cl" },
- { "l2_normalize_x", "common/l2_normalize.cl" },
- { "l2_normalize_y", "common/l2_normalize.cl" },
- { "l2_normalize_z", "common/l2_normalize.cl" },
- { "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" },
- { "mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl" },
- { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" },
- { "mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl" },
- { "mat_mul_native_nt_nt", "common/mat_mul.cl" },
- { "mat_mul_native_nt_t", "common/mat_mul.cl" },
- { "mat_mul_native_t_nt", "common/mat_mul.cl" },
- { "mat_mul_native_t_t", "common/mat_mul.cl" },
- { "mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl" },
- { "mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl" },
- { "mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl" },
- { "mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl" },
- { "mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl" },
- { "mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl" },
- { "mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl" },
- { "mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl" },
- { "max_unpooling_layer_2", "common/unpooling_layer.cl" },
- { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" },
- { "memset", "common/memset.cl" },
- { "minmax_layer", "common/minmax_layer.cl" },
- { "non_max_suppression", "common/nonmax.cl" },
- { "pad_layer_constant", "common/pad_layer.cl" },
- { "pad_layer_symmetric_reflect", "common/pad_layer.cl" },
- { "permute", "common/permute.cl" },
- { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" },
- { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" },
- { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" },
- { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" },
- { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" },
- { "quantization_layer", "common/quantization_layer.cl" },
- { "range", "common/range.cl" },
- { "range_quantized", "common/range.cl" },
- { "reduction_operation_x", "common/reduction_operation.cl" },
- { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" },
- { "reduction_operation_y", "common/reduction_operation.cl" },
- { "reduction_operation_z", "common/reduction_operation.cl" },
- { "reduction_operation_w", "common/reduction_operation.cl" },
- { "reshape_layer", "common/reshape_layer.cl" },
- { "reshape_to_columns", "common/convolution_layer.cl" },
- { "reverse", "common/reverse.cl" },
- { "roi_align_layer", "common/roi_align_layer.cl" },
- { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" },
- { "roi_pooling_layer", "common/roi_pooling_layer.cl" },
- { "select_same_rank", "common/select.cl" },
- { "select_different_rank_2", "common/select.cl" },
- { "select_different_rank_n", "common/select.cl" },
- { "softmax_layer_norm", "common/softmax_layer.cl" },
- { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" },
- { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" },
- { "stack_layer", "common/stack_layer.cl" },
- { "strided_slice", "common/slice_ops.cl" },
- { "tile", "common/tile.cl" },
- { "transpose", "common/transpose.cl" },
+ {"activation_layer", "common/activation_layer.cl"},
+ {"activation_layer_quant", "common/activation_layer_quant.cl"},
+ {"activation_layer_quant_f32", "common/activation_layer_quant.cl"},
+ {"arg_min_max_x", "common/arg_min_max.cl"},
+ {"arg_min_max_y", "common/arg_min_max.cl"},
+ {"arg_min_max_z", "common/arg_min_max.cl"},
+ {"arg_min_max_w", "common/arg_min_max.cl"},
+ {"bitwise_or", "common/bitwise_op.cl"},
+ {"bitwise_and", "common/bitwise_op.cl"},
+ {"bitwise_xor", "common/bitwise_op.cl"},
+ {"bitwise_not", "common/bitwise_op.cl"},
+ {"bounding_box_transform", "common/bounding_box_transform.cl"},
+ {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"},
+ {"compare_equal", "common/comparisons.cl"},
+ {"compare_equal_quantized", "common/comparisons.cl"},
+ {"compare_notequal", "common/comparisons.cl"},
+ {"compare_notequal_quantized", "common/comparisons.cl"},
+ {"compare_greater", "common/comparisons.cl"},
+ {"compare_greater_quantized", "common/comparisons.cl"},
+ {"compare_greaterequal", "common/comparisons.cl"},
+ {"compare_greaterequal_quantized", "common/comparisons.cl"},
+ {"compare_less", "common/comparisons.cl"},
+ {"compare_less_quantized", "common/comparisons.cl"},
+ {"compare_lessequal", "common/comparisons.cl"},
+ {"compare_lessequal_quantized", "common/comparisons.cl"},
+ {"concatenate", "common/concatenate.cl"},
+ {"concatenate_width", "common/concatenate.cl"},
+ {"concatenate_height", "common/concatenate.cl"},
+ {"concatenate_width_x2", "common/concatenate.cl"},
+ {"concatenate_width_x4", "common/concatenate.cl"},
+ {"col2im", "common/col2im.cl"},
+ {"cast_down", "common/cast.cl"},
+ {"cast_up", "common/cast.cl"},
+ {"convert_fc_weights", "common/convert_fc_weights.cl"},
+ {"copy_tensor", "common/copy_tensor.cl"},
+ {"crop_tensor", "common/crop_tensor.cl"},
+ {"deconvolution_reshape", "common/deconvolution_layer.cl"},
+ {"deconvolution_upsample", "common/deconvolution_layer.cl"},
+ {"dequantization_layer", "common/dequantization_layer.cl"},
+ {"elementwise_operation_ADD", "common/elementwise_operation.cl"},
+ {"elementwise_operation_SUB", "common/elementwise_operation.cl"},
+ {"elementwise_operation_MAX", "common/elementwise_operation.cl"},
+ {"elementwise_operation_MIN", "common/elementwise_operation.cl"},
+ {"elementwise_operation_DIV", "common/elementwise_operation.cl"},
+ {"elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl"},
+ {"elementwise_operation_POWER", "common/elementwise_operation.cl"},
+ {"elementwise_operation_PRELU", "common/elementwise_operation.cl"},
+ {"elementwise_operation_AND", "common/elementwise_operation.cl"},
+ {"elementwise_operation_OR", "common/elementwise_operation.cl"},
+ {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"},
+ {"elementwise_unary", "common/elementwise_unary.cl"},
+ {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"},
+ {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"},
+ {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"},
+ {"fft_radix_2_first_stage_axis_0", "common/fft.cl"},
+ {"fft_radix_2_first_stage_axis_1", "common/fft.cl"},
+ {"fft_radix_2_axis_0", "common/fft.cl"},
+ {"fft_radix_2_axis_1", "common/fft.cl"},
+ {"fft_radix_3_first_stage_axis_0", "common/fft.cl"},
+ {"fft_radix_3_first_stage_axis_1", "common/fft.cl"},
+ {"fft_radix_3_axis_0", "common/fft.cl"},
+ {"fft_radix_3_axis_1", "common/fft.cl"},
+ {"fft_radix_4_first_stage_axis_0", "common/fft.cl"},
+ {"fft_radix_4_first_stage_axis_1", "common/fft.cl"},
+ {"fft_radix_4_axis_0", "common/fft.cl"},
+ {"fft_radix_4_axis_1", "common/fft.cl"},
+ {"fft_radix_5_first_stage_axis_0", "common/fft.cl"},
+ {"fft_radix_5_first_stage_axis_1", "common/fft.cl"},
+ {"fft_radix_5_axis_0", "common/fft.cl"},
+ {"fft_radix_5_axis_1", "common/fft.cl"},
+ {"fft_radix_7_first_stage_axis_0", "common/fft.cl"},
+ {"fft_radix_7_first_stage_axis_1", "common/fft.cl"},
+ {"fft_radix_7_axis_0", "common/fft.cl"},
+ {"fft_radix_7_axis_1", "common/fft.cl"},
+ {"fft_radix_8_first_stage_axis_0", "common/fft.cl"},
+ {"fft_radix_8_first_stage_axis_1", "common/fft.cl"},
+ {"fft_radix_8_axis_0", "common/fft.cl"},
+ {"fft_radix_8_axis_1", "common/fft.cl"},
+ {"fft_scale_conj", "common/fft_scale.cl"},
+ {"fill_image_borders_constant", "common/fill_border.cl"},
+ {"fill_image_borders_replicate", "common/fill_border.cl"},
+ {"floor_layer", "common/floor.cl"},
+ {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"},
+ {"gather", "common/gather.cl"},
+ {"gemm_ma_f16", "common/gemm.cl"},
+ {"gemm_ma_f32", "common/gemm.cl"},
+ {"gemm_mv", "common/gemv.cl"},
+ {"gemm_mv_quantized", "common/gemv.cl"},
+ {"gemm_mm_native", "common/gemm.cl"},
+ {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"},
+ {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"},
+ {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"},
+ {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"},
+ {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"},
+ {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"},
+ {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"},
+ {"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"},
+ {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"},
+ {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"},
+ {"gemm_lc_vm_f32", "common/gemm.cl"},
+ {"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"},
+ {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"},
+ {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"},
+ {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"},
+ {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"},
+ {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"},
+ {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"},
+ {"gemmlowp_mm_native", "common/gemmlowp.cl"},
+ {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"},
+ {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"},
+ {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"},
+ {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"},
+ {"gemmlowp_offset_contribution", "common/gemmlowp.cl"},
+ {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"},
+ {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl"},
+ {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"},
+ {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"},
+ {"instance_normalization", "common/instance_normalization.cl"},
+ {"compute_mean_var", "common/instance_normalization.cl"},
+ {"l2_normalize_x", "common/l2_normalize.cl"},
+ {"l2_normalize_y", "common/l2_normalize.cl"},
+ {"l2_normalize_z", "common/l2_normalize.cl"},
+ {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"},
+ {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"},
+ {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"},
+ {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"},
+ {"mat_mul_native_nt_nt", "common/mat_mul.cl"},
+ {"mat_mul_native_nt_t", "common/mat_mul.cl"},
+ {"mat_mul_native_t_nt", "common/mat_mul.cl"},
+ {"mat_mul_native_t_t", "common/mat_mul.cl"},
+ {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"},
+ {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"},
+ {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"},
+ {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"},
+ {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"},
+ {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"},
+ {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"},
+ {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"},
+ {"max_unpooling_layer_2", "common/unpooling_layer.cl"},
+ {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"},
+ {"memset", "common/memset.cl"},
+ {"minmax_layer", "common/minmax_layer.cl"},
+ {"non_max_suppression", "common/nonmax.cl"},
+ {"pad_layer_constant", "common/pad_layer.cl"},
+ {"pad_layer_symmetric_reflect", "common/pad_layer.cl"},
+ {"permute", "common/permute.cl"},
+ {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"},
+ {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"},
+ {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"},
+ {"pixelwise_mul_quantized", "common/pixelwise_mul_int.cl"},
+ {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"},
+ {"quantization_layer", "common/quantization_layer.cl"},
+ {"range", "common/range.cl"},
+ {"range_quantized", "common/range.cl"},
+ {"reduction_operation_x", "common/reduction_operation.cl"},
+ {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"},
+ {"reduction_operation_y", "common/reduction_operation.cl"},
+ {"reduction_operation_z", "common/reduction_operation.cl"},
+ {"reduction_operation_w", "common/reduction_operation.cl"},
+ {"reshape_layer", "common/reshape_layer.cl"},
+ {"reshape_to_columns", "common/convolution_layer.cl"},
+ {"reverse", "common/reverse.cl"},
+ {"roi_align_layer", "common/roi_align_layer.cl"},
+ {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"},
+ {"roi_pooling_layer", "common/roi_pooling_layer.cl"},
+ {"select_same_rank", "common/select.cl"},
+ {"select_different_rank_2", "common/select.cl"},
+ {"select_different_rank_n", "common/select.cl"},
+ {"softmax_layer_norm", "common/softmax_layer.cl"},
+ {"softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl"},
+ {"softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl"},
+ {"stack_layer", "common/stack_layer.cl"},
+ {"strided_slice", "common/slice_ops.cl"},
+ {"tile", "common/tile.cl"},
+ {"transpose", "common/transpose.cl"},
#ifdef ENABLE_NCHW_KERNELS
- { "batch_to_space_nchw", "nchw/batch_to_space.cl" },
- { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" },
- { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" },
- { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" },
- { "depth_to_space_nchw", "nchw/depth_to_space.cl" },
- { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" },
- { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" },
- { "direct_convolution_nchw", "nchw/direct_convolution.cl" },
+ {"batch_to_space_nchw", "nchw/batch_to_space.cl"},
+ {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"},
+ {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"},
+ {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"},
+ {"depth_to_space_nchw", "nchw/depth_to_space.cl"},
+ {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"},
+ {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"},
+ {"direct_convolution_nchw", "nchw/direct_convolution.cl"},
- { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" },
- { "im2col3x3_nchw", "nchw/im2col.cl" },
- { "im2col5x5_nchw", "nchw/im2col.cl" },
- { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" },
- { "im2col_generic_nchw", "nchw/im2col.cl" },
- { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" },
- { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" },
- { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" },
- { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" },
- { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" },
- { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
- { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" },
- { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
- { "reorg_layer_nchw", "nchw/reorg_layer.cl" },
- { "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
- { "scale_bilinear_nchw", "nchw/scale.cl" },
- { "space_to_batch_nchw", "nchw/space_to_batch.cl" },
- { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
- { "space_to_depth_nchw", "nchw/space_to_depth.cl" },
- { "upsample_layer_nchw", "nchw/upsample_layer.cl" },
- { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" },
- { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
- { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" },
+ {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"},
+ {"im2col3x3_nchw", "nchw/im2col.cl"},
+ {"im2col5x5_nchw", "nchw/im2col.cl"},
+ {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"},
+ {"im2col_generic_nchw", "nchw/im2col.cl"},
+ {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"},
+ {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"},
+ {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"},
+ {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"},
+ {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"},
+ {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"},
+ {"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"},
+ {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"},
+ {"reorg_layer_nchw", "nchw/reorg_layer.cl"},
+ {"scale_nearest_neighbour_nchw", "nchw/scale.cl"},
+ {"scale_bilinear_nchw", "nchw/scale.cl"},
+ {"space_to_batch_nchw", "nchw/space_to_batch.cl"},
+ {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"},
+ {"space_to_depth_nchw", "nchw/space_to_depth.cl"},
+ {"upsample_layer_nchw", "nchw/upsample_layer.cl"},
+ {"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+ {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"},
+ {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"},
#endif /* ENABLE_NCHW_KERNELS */
#ifdef ENABLE_NHWC_KERNELS
- { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" },
- { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" },
- { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" },
- { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" },
- { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" },
- { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" },
- { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" },
- { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" },
- { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" },
- { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" },
- { "im2col3x3_nhwc", "nhwc/im2col.cl" },
- { "im2col9x9_nhwc", "nhwc/im2col.cl" },
- { "im2col_generic_nhwc", "nhwc/im2col.cl" },
- { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" },
- { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" },
- { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" },
- { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" },
- { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" },
- { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" },
- { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" },
- { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
- { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
- { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" },
- { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" },
- { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
- { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
- { "scale_bilinear_nhwc", "nhwc/scale.cl" },
- { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
- { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
- { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
- { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" },
- { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" },
- { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" },
- { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
- { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" },
- { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" },
+ {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"},
+ {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"},
+ {"batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl"},
+ {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"},
+ {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"},
+ {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"},
+ {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"},
+ {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"},
+ {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"},
+ {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"},
+ {"im2col3x3_nhwc", "nhwc/im2col.cl"},
+ {"im2col9x9_nhwc", "nhwc/im2col.cl"},
+ {"im2col_generic_nhwc", "nhwc/im2col.cl"},
+ {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"},
+ {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"},
+ {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"},
+ {"normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl"},
+ {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"},
+ {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"},
+ {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"},
+ {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"},
+ {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"},
+ {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"},
+ {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"},
+ {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"},
+ {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"},
+ {"scale_bilinear_nhwc", "nhwc/scale.cl"},
+ {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"},
+ {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"},
+ {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"},
+ {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"},
+ {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"},
+ {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"},
+ {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+ {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"},
+ {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"},
#endif /* ENABLE_NHWC_KERNELS */
};
-const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
-{
+const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
"activation_float_helpers.h",
@@ -996,7 +985,7 @@ std::string ClKernelLibrary::program_name(const std::string &kernel_name) const
// Find which program contains the kernel
auto kernel_program_it = _kernel_program_map.find(kernel_name);
- if(_kernel_program_map.end() == kernel_program_it)
+ if (_kernel_program_map.end() == kernel_program_it)
{
ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
}
@@ -1022,14 +1011,14 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr
#ifdef EMBEDDED_KERNELS
#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
const auto inflatted_program_source_it = _decompressed_source_map.find(program_name);
- if(inflatted_program_source_it != _decompressed_source_map.end())
+ if (inflatted_program_source_it != _decompressed_source_map.end())
{
- return ClProgramInfo{ inflatted_program_source_it->second, false };
+ return ClProgramInfo{inflatted_program_source_it->second, false};
}
#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
const auto program_source_it = _program_source_map.find(program_name);
- if(program_source_it == _program_source_map.end())
+ if (program_source_it == _program_source_map.end())
{
ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
}
@@ -1042,7 +1031,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr
program_source = std::move(decompressed_program_source);
#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
- return ClProgramInfo{ program_source, false };
+ return ClProgramInfo{program_source, false};
#else /* EMBEDDED_KERNELS */
// Check for binary
std::string source_name = _kernel_path + program_name;
@@ -1050,12 +1039,12 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr
std::string program_source{};
bool is_binary = false;
- if(std::ifstream(binary_name).is_open())
+ if (std::ifstream(binary_name).is_open())
{
program_source = read_file(binary_name, true);
is_binary = true;
}
- else if(std::ifstream(source_name).is_open())
+ else if (std::ifstream(source_name).is_open())
{
program_source = read_file(source_name, false);
}
@@ -1064,7 +1053,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr
ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
}
- return ClProgramInfo{ program_source, is_binary };
+ return ClProgramInfo{program_source, is_binary};
#endif /* EMBEDDED_KERNELS */
}
} // namespace opencl
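The program_name()/program() paths reformatted above boil down to a two-level lookup: kernel name to program name via _kernel_program_map, then program name to source via _program_source_map (or a decompressed/binary variant). A standalone sketch of that pattern follows; the table contents and function name are illustrative only, not the library's real maps or API.

    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    // Illustrative stand-ins for the two static maps in ClKernelLibrary.
    static const std::map<std::string, std::string> kernel_to_program = {
        {"scale_bilinear_nhwc", "nhwc/scale.cl"},
        {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"},
    };
    static const std::map<std::string, std::string> program_to_source = {
        {"nhwc/scale.cl", "__kernel void scale(/* ... */) { /* ... */ }"},
    };

    std::string source_for_kernel(const std::string &kernel_name)
    {
        // First hop: which program contains the kernel?
        const auto program_it = kernel_to_program.find(kernel_name);
        if (program_it == kernel_to_program.end())
        {
            throw std::runtime_error("Kernel " + kernel_name + " not found");
        }
        // Second hop: fetch the embedded source for that program.
        const auto source_it = program_to_source.find(program_it->second);
        if (source_it == program_to_source.end())
        {
            throw std::runtime_error("No embedded source for " + program_it->second);
        }
        return source_it->second;
    }

    int main()
    {
        std::cout << source_for_kernel("scale_bilinear_nhwc") << '\n';
    }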
diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h
index 42bec95032..cd1d689199 100644
--- a/src/gpu/cl/ClKernelLibrary.h
+++ b/src/gpu/cl/ClKernelLibrary.h
@@ -52,8 +52,8 @@ public:
/** Structure to encapsulate program related information */
struct ClProgramInfo
{
- std::string program{}; /**< Program raw string */
- bool is_binary{ false }; /**< Flag that indicates if is in binary format */
+ std::string program{}; /**< Program raw string */
+ bool is_binary{false}; /**< Flag that indicates if is in binary format */
};
public:
@@ -84,10 +84,12 @@ public:
std::string program_name(const std::string &kernel_name) const;
private:
- std::string _kernel_path{}; /**< Path to the kernels folder. */
- mutable std::map<std::string, std::string> _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
- static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
- static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
+ std::string _kernel_path{}; /**< Path to the kernels folder. */
+ mutable std::map<std::string, std::string>
+ _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
+ static const std::map<std::string, std::string>
+ _kernel_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
Used for compile-time kernel inclusion. >*/
};
} // namespace opencl
diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp
index 2123adcf39..0cb7af5b61 100644
--- a/src/gpu/cl/ClQueue.cpp
+++ b/src/gpu/cl/ClQueue.cpp
@@ -36,7 +36,7 @@ namespace
{
CLTunerMode map_tuner_mode(AclTuningMode mode)
{
- switch(mode)
+ switch (mode)
{
case AclRapid:
return CLTunerMode::RAPID;
@@ -55,7 +55,7 @@ CLTunerMode map_tuner_mode(AclTuningMode mode)
std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
{
- if(options == nullptr || options->mode == AclTuningModeNone)
+ if (options == nullptr || options->mode == AclTuningModeNone)
{
return nullptr;
}
@@ -68,8 +68,7 @@ std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
}
} // namespace
-ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options)
- : IQueue(ctx), _tuner(nullptr)
+ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr)
{
_tuner = populate_tuner(options);
}
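The ClQueue changes above are the usual tuner plumbing: map the public tuning mode onto the internal tuner mode, and skip tuner creation entirely when tuning is disabled. A minimal sketch of that shape, using hypothetical TuningMode/TunerKind types rather than the AclTuningMode/CLTunerMode ones in the diff:

    #include <iostream>
    #include <memory>

    enum class TuningMode { None, Rapid, Normal, Exhaustive };
    enum class TunerKind  { Rapid, Normal, Exhaustive };

    struct Tuner
    {
        explicit Tuner(TunerKind k) : kind(k) {}
        TunerKind kind;
    };

    // Map the public mode onto the internal tuner kind.
    TunerKind map_mode(TuningMode mode)
    {
        switch (mode)
        {
            case TuningMode::Rapid:
                return TunerKind::Rapid;
            case TuningMode::Exhaustive:
                return TunerKind::Exhaustive;
            case TuningMode::Normal:
            default:
                return TunerKind::Normal;
        }
    }

    // Return no tuner at all when tuning is disabled.
    std::unique_ptr<Tuner> make_tuner(TuningMode mode)
    {
        if (mode == TuningMode::None)
        {
            return nullptr;
        }
        return std::make_unique<Tuner>(map_mode(mode));
    }

    int main()
    {
        std::cout << (make_tuner(TuningMode::None) ? "tuner" : "no tuner") << '\n';
    }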
diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h
index b16a0f4e83..09ffb06cf3 100644
--- a/src/gpu/cl/ClQueue.h
+++ b/src/gpu/cl/ClQueue.h
@@ -24,10 +24,10 @@
#ifndef SRC_GPU_CLQUEUE_H
#define SRC_GPU_CLQUEUE_H
-#include "src/common/IQueue.h"
-
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/IQueue.h"
+
#include <memory>
namespace arm_compute
diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp
index 0df07813e3..27422a4130 100644
--- a/src/gpu/cl/ClTensor.cpp
+++ b/src/gpu/cl/ClTensor.cpp
@@ -31,8 +31,7 @@ namespace gpu
{
namespace opencl
{
-ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc)
- : ITensorV2(ctx), _legacy_tensor()
+ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
{
ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl));
_legacy_tensor = std::make_unique<CLTensor>();
@@ -43,7 +42,7 @@ void *ClTensor::map()
{
ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
- if(_legacy_tensor == nullptr)
+ if (_legacy_tensor == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!");
return nullptr;
@@ -57,7 +56,7 @@ StatusCode ClTensor::unmap()
{
ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
- if(_legacy_tensor == nullptr)
+ if (_legacy_tensor == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!");
return StatusCode::RuntimeError;
diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h
index 99d228c0b8..70184cd4bd 100644
--- a/src/gpu/cl/ClTensor.h
+++ b/src/gpu/cl/ClTensor.h
@@ -24,10 +24,10 @@
#ifndef SRC_GPU_CLTENSOR_H
#define SRC_GPU_CLTENSOR_H
-#include "src/common/ITensorV2.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/common/ITensorV2.h"
+
namespace arm_compute
{
namespace gpu
@@ -54,7 +54,7 @@ public:
void *map() override;
StatusCode unmap() override;
arm_compute::ITensor *tensor() const override;
- StatusCode import(void *handle, ImportMemoryType type) override;
+ StatusCode import(void *handle, ImportMemoryType type) override;
private:
std::unique_ptr<CLTensor> _legacy_tensor;
@@ -63,4 +63,4 @@ private:
} // namespace gpu
} // namespace arm_compute
-#endif /* SRC_GPU_CLTENSOR_H */ \ No newline at end of file
+#endif /* SRC_GPU_CLTENSOR_H */
diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h
index 52ea3c9183..4f07e9ad68 100644
--- a/src/gpu/cl/IClKernel.h
+++ b/src/gpu/cl/IClKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_ICL_KERNEL_H
#include "arm_compute/core/ITensorInfo.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp
index ab1543729f..a85296f7cd 100644
--- a/src/gpu/cl/kernels/ClActivationKernel.cpp
+++ b/src/gpu/cl/kernels/ClActivationKernel.cpp
@@ -28,14 +28,14 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
#include <set>
@@ -51,36 +51,47 @@ namespace
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16, DataType::F16, DataType::F32);
- static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations =
- {
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+ static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+ ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH,
ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
};
- const DataType data_type = src->data_type();
- const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0),
- "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+ const DataType data_type = src->data_type();
+ const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+ const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) &&
+ (quantized_supported_activations.count(f_act) == 0),
+ "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and "
+ "lower/upper bounded relu are supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+ (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+ ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+ (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+ (oq_info != QuantizationInfo(1.f / 256.f, -128)));
// Checks performed when destination is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -95,15 +106,18 @@ ClActivationKernel::ClActivationKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info)
+void ClActivationKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
_run_in_place = (dst == nullptr) || (dst == src);
- if(dst != nullptr)
+ if (dst != nullptr)
{
// Destination auto initialization if not yet initialized
auto_init_if_empty(*dst, *src->clone());
@@ -119,11 +133,10 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen
const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
const bool is_quantized = is_data_type_quantized(dt);
- const bool perform_activation_in_float =
- (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- || (f_act == ActivationLayerInfo::ActivationFunction::TANH)
- || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
+ const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) ||
+ (f_act == ActivationLayerInfo::ActivationFunction::TANH) ||
+ (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) ||
+ (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
// Set build options
CLBuildOptions build_opts;
@@ -132,22 +145,23 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen
build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
std::string kernel_name = std::string("activation_layer");
// Set quantization info build options
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
- if(!perform_activation_in_float)
+ if (!perform_activation_in_float)
{
int a_const_int = 0;
int b_const_int = 0;
// Create quantized version of constants a, b if needed
- switch(dt)
+ switch (dt)
{
case DataType::QASYMM8:
{
@@ -180,22 +194,25 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen
}
// Quantized value of 0 corresponds to the offset o1
- build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
+ build_opts.add_option(
+ ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale)));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+ "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
// Set correct kernel name
kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
// Set scale and offset of the source and destination if they have different quantization info
- if(dst != nullptr)
+ if (dst != nullptr)
{
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
- if(iq_info != oq_info)
+ if (iq_info != oq_info)
{
build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale)));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+ "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
}
}
}
@@ -235,8 +252,9 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst);
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -246,13 +264,12 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, src, slice);
- if(!_run_in_place)
+ if (!_run_in_place)
{
add_3D_tensor_argument(idx, dst, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
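Several kernels above pass -DVEC_SIZE and -DVEC_SIZE_LEFTOVER build options, where the leftover is simply the remainder of the innermost dimension over the vector width. A worked sketch with illustrative numbers:

    #include <cstdio>

    int main()
    {
        const unsigned int width    = 18; // hypothetical size of dimension 0
        const unsigned int vec_size = 4;  // elements processed per iteration

        const unsigned int leftover = width % vec_size; // 18 % 4 == 2

        // These values would end up as "-DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=2".
        std::printf("-DVEC_SIZE=%u -DVEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
    }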
diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h
index 82e35b6104..ab7607bb82 100644
--- a/src/gpu/cl/kernels/ClActivationKernel.h
+++ b/src/gpu/cl/kernels/ClActivationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -51,7 +52,10 @@ public:
* @param[out] dst Destination tensor info. Data type supported: same as @p src
* @param[in] act_info Activation layer information.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ ActivationLayerInfo act_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClActivationKernel::configure()
@@ -64,7 +68,7 @@ public:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
private:
- bool _run_in_place{ false };
+ bool _run_in_place{false};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
index 3d8ecf1fcc..a853f6bc1b 100644
--- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -66,12 +66,15 @@ ClBatchConcatenateKernel::ClBatchConcatenateKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
+void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int batch_offset,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
_batch_offset = batch_offset;
@@ -81,8 +84,9 @@ void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -136,8 +140,9 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window slice = window.first_slice_window_3D();
@@ -152,9 +157,8 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
-} // namespace opencl
} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
index f6b7c0ed09..549576b628 100644
--- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
@@ -53,7 +53,8 @@ public:
* @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClBatchConcatenateKernel::configure()
@@ -66,7 +67,7 @@ public:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
private:
- unsigned int _batch_offset{ 0 };
+ unsigned int _batch_offset{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
index f621ad62d7..9ca35634f4 100644
--- a/src/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -32,10 +32,10 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -52,20 +52,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32, DataType::S64, DataType::U64);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
- 1,
- DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
- DataType::U16, DataType::U32, DataType::S32, DataType::F16,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
+ DataType::S16, DataType::U16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32, DataType::S64, DataType::U64);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::S16, DataType::U16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
@@ -79,7 +76,10 @@ ClCastKernel::ClCastKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+void ClCastKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ ConvertPolicy policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -88,7 +88,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Get data sizes
const size_t src_size = data_size_from_type(src->data_type());
@@ -100,12 +100,14 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen
// Set build options
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
// Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
- build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT");
+ build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()),
+ "-DIS_DATA_TYPE_FLOAT");
build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
// Create kernel
@@ -148,8 +150,9 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -162,8 +165,7 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
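As the comment in ClCastKernel::configure notes, float sources always get -DSATURATE because an out-of-range float-to-integer conversion is implementation defined. A host-side sketch of what a saturating conversion to signed 8-bit looks like; this is an explanatory illustration only, and the rounding choice is an assumption rather than the kernel's exact behaviour:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Clamp to the representable range before converting, so out-of-range
    // inputs saturate instead of invoking implementation-defined behaviour.
    int8_t saturate_cast_to_s8(float v)
    {
        const float clamped = std::clamp(std::round(v), -128.0f, 127.0f);
        return static_cast<int8_t>(clamped);
    }

    int main()
    {
        std::printf("%d %d %d\n",
                    saturate_cast_to_s8(300.0f),   // clamps to 127
                    saturate_cast_to_s8(-1000.0f), // clamps to -128
                    saturate_cast_to_s8(3.4f));    // rounds to 3
    }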
diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h
index a021b3c78c..07b0b61443 100644
--- a/src/gpu/cl/kernels/ClCastKernel.h
+++ b/src/gpu/cl/kernels/ClCastKernel.h
@@ -64,7 +64,8 @@ public:
* @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ void
+ configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClCastKernel::configure()
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
index 3316742912..9972e07f05 100644
--- a/src/gpu/cl/kernels/ClCol2ImKernel.cpp
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -47,29 +48,38 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
// Checks performed when output is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW,
+ "Col2Im output's data layout must always be NCHW");
}
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
+ auto_init_if_empty(*dst, src->clone()
+ ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups))
+ .set_data_layout(DataLayout::NCHW));
constexpr unsigned int num_elems_read_per_iteration = 8;
@@ -80,18 +90,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration);
bool window_changed = update_window_and_padding(win, input_access);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
-ClCol2ImKernel::ClCol2ImKernel()
- : _convolved_dims()
+ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims()
{
_type = CLKernelType::ELEMENTWISE;
}
-void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+void ClCol2ImKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -132,11 +146,15 @@ void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorI
_config_id += support::cpp11::to_string(dst->dimension(1));
}
-Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+Status ClCol2ImKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
return Status{};
}
@@ -168,8 +186,7 @@ void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
add_3D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice_out);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
+ } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h
index e19b7c8e16..34194aba01 100644
--- a/src/gpu/cl/kernels/ClCol2ImKernel.h
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_COL2IM_KERNEL_H
#include "arm_compute/core/Size2D.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -68,14 +69,19 @@ public:
* @param[in] convolved_dims Output convolved dimensions.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &convolved_dims,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClCol2ImKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
index 716dec1f30..85d3c3939c 100644
--- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -45,17 +46,21 @@ ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape,
- DataLayout data_layout)
+void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Output tensor auto initialisation if not yet initialized
auto_init_if_empty(*dst, *src->clone());
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
- ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
@@ -85,8 +90,10 @@ void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape,
- DataLayout data_layout)
+Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
@@ -96,7 +103,7 @@ Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, co
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -110,8 +117,9 @@ void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Wi
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
unsigned int idx = 0;
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
index 16000e82f6..0ddb54561a 100644
--- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
@@ -55,14 +55,21 @@ public:
* @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClConvertFullyConnectedWeightsKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp
index 4719448819..c80ef664f5 100644
--- a/src/gpu/cl/kernels/ClCopyKernel.cpp
+++ b/src/gpu/cl/kernels/ClCopyKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,11 +51,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
// Validate dst if initialized
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- if(dst_window == nullptr)
+ if (dst_window == nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
}
@@ -74,12 +75,15 @@ ClCopyKernel::ClCopyKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
+void ClCopyKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Create kernel
CLBuildOptions build_opts;
@@ -93,7 +97,7 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen
const Window win_config = calculate_max_window(*src, Steps(vec_size_x));
- if(dst_window != nullptr)
+ if (dst_window != nullptr)
{
_has_dst_window = true;
_dst_window = Window(*dst_window);
@@ -101,9 +105,11 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen
const int vec_size_x_leftover = width_x % vec_size_x;
const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x);
- if(multi_access_x)
+ if (multi_access_x)
{
- _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
+ _dst_window.set(Window::DimX,
+ Window::Dimension(dst_window->x().start(),
+ ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
}
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
@@ -127,7 +133,8 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
+Status
+ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window));
@@ -139,12 +146,13 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window slice;
- if(_has_dst_window)
+ if (_has_dst_window)
{
slice = window.first_slice_window_3D();
Window out_slice = _dst_window.first_slice_window_3D();
@@ -154,8 +162,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, out_slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
+ } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
}
else
{
@@ -167,8 +174,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
}
} // namespace kernels
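ClCopyKernel aligns the destination window end with ceil_to_multiple(end, vec_size). Assuming the usual round-up-to-the-next-multiple definition, the arithmetic looks like this:

    #include <cstdio>

    // Round value up to the next multiple of "multiple" (assumed definition).
    unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int main()
    {
        // e.g. a window ending at 18 with a vector size of 4 is padded to 20.
        std::printf("%u\n", ceil_to_multiple_sketch(18, 4)); // prints 20
    }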
diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h
index 63fd806586..f915bf672d 100644
--- a/src/gpu/cl/kernels/ClCopyKernel.h
+++ b/src/gpu/cl/kernels/ClCopyKernel.h
@@ -47,7 +47,10 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src.
* @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClCopyKernel::configure()
diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp
index 87ad6b49d9..0c503e13fc 100644
--- a/src/gpu/cl/kernels/ClCropKernel.cpp
+++ b/src/gpu/cl/kernels/ClCropKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
@@ -46,8 +47,14 @@ ClCropKernel::ClCropKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index,
- float extrapolation_value, Window *dst_window)
+void ClCropKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window));
@@ -60,7 +67,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen
// Create and update the window (if needed)
Window win = calculate_max_window(*dst);
- if(dst_window != nullptr)
+ if (dst_window != nullptr)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window);
win = *dst_window;
@@ -70,7 +77,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen
const bool multi_access_x = dst_width_x >= vec_size_x;
const bool remainder_x = dst_width_x % vec_size_x > 0;
- if(multi_access_x)
+ if (multi_access_x)
{
win.set(Window::DimX,
Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
@@ -81,13 +88,21 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
_kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
}
-Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status ClCropKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
ARM_COMPUTE_UNUSED(extrapolation_value, dst_window);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
@@ -95,14 +110,15 @@ Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Co
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
- ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2))
- || end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) ||
+ end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3));
- if(dst_window != nullptr)
+ if (dst_window != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1);
}
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -116,12 +132,15 @@ void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window in_slice = Window();
in_slice.use_tensor_dimensions(src->info()->tensor_shape());
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
+ in_slice.set(Window::DimX,
+ Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()),
+ window.x().step()));
in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
unsigned int idx = 0;
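The same vector-size bookkeeping recurs throughout these kernels (ClCropKernel::configure above is typical): the window end is rounded up to a multiple of the vector width, and LAST_ACCESSED_X records where the last full vector may start so the kernel can shift it back instead of reading out of bounds. A standalone sketch of that arithmetic with made-up numbers, where the local ceil_to_multiple helper only mimics what the library helper of the same name is assumed to do:

#include <algorithm>
#include <cstdio>

// Local stand-in for arm_compute's ceil_to_multiple (assumed to round up to the next multiple).
static int ceil_to_multiple(int value, int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    const int dst_width_x = 37; // hypothetical output width along x
    const int vec_size_x  = 16; // vector width picked by the kernel

    const bool multi_access_x = dst_width_x >= vec_size_x;    // true
    const bool remainder_x    = dst_width_x % vec_size_x > 0; // true: 37 is not a multiple of 16

    // The window end is padded up so the loop always issues full vectors...
    const int padded_end = ceil_to_multiple(dst_width_x, vec_size_x); // 48
    // ...and the last in-bounds position a full vector may start from.
    const int last_accessed_x = std::max(dst_width_x - vec_size_x, 0); // 21

    std::printf("multi_access=%d remainder=%d padded_end=%d last_accessed_x=%d\n",
                multi_access_x, remainder_x, padded_end, last_accessed_x);
    return 0;
}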
diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h
index 2f166e184c..506262608c 100644
--- a/src/gpu/cl/kernels/ClCropKernel.h
+++ b/src/gpu/cl/kernels/ClCropKernel.h
@@ -53,16 +53,27 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in]  dst_window          Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClCropKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
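As a quick orientation for the reflowed declarations above, a hedged usage sketch of the validate-then-configure flow for ClCropKernel follows. The headers, tensor shapes and the default constructor are assumptions for illustration and are not part of this patch:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClCropKernel.h"

using namespace arm_compute;

void crop_sketch(const CLCompileContext &compile_context)
{
    // Hypothetical NHWC batch of two 224x224 RGB images; the [C, W, H, N] indexing
    // matches the dimension(1)/dimension(2)/dimension(3) checks in validate().
    TensorInfo src(TensorShape(3U, 224U, 224U, 2U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);

    // 101x101 crop of batch element 0; validate() requires an F32 destination.
    TensorInfo dst(TensorShape(3U, 101U, 101U), 1, DataType::F32);
    dst.set_data_layout(DataLayout::NHWC);

    const Coordinates2D start{10, 10};
    const Coordinates2D end{110, 110};

    ARM_COMPUTE_ERROR_THROW_ON(
        opencl::kernels::ClCropKernel::validate(&src, &dst, start, end, /*batch_index=*/0));

    opencl::kernels::ClCropKernel kernel; // assumed default-constructible
    kernel.configure(compile_context, &src, &dst, start, end, /*batch_index=*/0);
}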
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
index a05cd1321e..ec44d88f01 100644
--- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -48,7 +48,8 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
@@ -60,18 +61,20 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con
}
} // namespace
-ClDepthConcatenateKernel::ClDepthConcatenateKernel()
- : _depth_offset(0)
+ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0)
{
_type = CLKernelType::ELEMENTWISE;
}
-void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
+void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int depth_offset,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
_depth_offset = depth_offset;
@@ -81,8 +84,9 @@ void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -122,8 +126,9 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window slice = window.first_slice_window_3D();
@@ -138,8 +143,7 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
index 4739677f3b..539f010303 100644
--- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
@@ -53,7 +53,8 @@ public:
* @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClDepthConcatenateKernel::configure()
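The @note above about the two lowest dimensions is easiest to see with concrete shapes; a hedged sketch of stacking two feature maps into one destination via two kernel instances (shapes and the default constructor are illustrative assumptions):

#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"

using namespace arm_compute;

void depth_concat_sketch(const CLCompileContext &compile_context)
{
    // Two hypothetical 32x32x8 feature maps written into a 32x32x16 destination at
    // depth offsets 0 and 8; equal W/H means the dimension gaps are 0, which is divisible by 2.
    TensorInfo src0(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    TensorInfo src1(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 32U, 16U), 1, DataType::F32);

    opencl::kernels::ClDepthConcatenateKernel k0, k1; // assumed default-constructible
    k0.configure(compile_context, &src0, /*depth_offset=*/0U, &dst);
    k1.configure(compile_context, &src1, /*depth_offset=*/8U, &dst);
}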
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
index 756cd56a8b..53429ab1aa 100644
--- a/src/gpu/cl/kernels/ClDequantizeKernel.cpp
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -34,7 +34,6 @@
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -49,9 +48,11 @@ namespace
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+ DataType::QSYMM16);
- if(dst->tensor_shape().total_size() > 0)
+ if (dst->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
@@ -74,7 +75,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
@@ -87,7 +88,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen
// Create kernel
CLBuildOptions build_opts;
- if(!is_quantized_per_channel)
+ if (!is_quantized_per_channel)
{
const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
@@ -103,16 +104,18 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
// Create kernel name
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*dst);
- if(multi_access_x)
+ if (multi_access_x)
{
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -136,10 +139,11 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::
const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
// Collapse window
- Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
+ Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4)
+ : window.collapse_if_possible(ICLKernel::window(), 3);
Window slice = new_window.first_slice_window_3D();
- if(is_quantized_per_channel)
+ if (is_quantized_per_channel)
{
unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters
_kernel.setArg(idx++, src->quantization().scale->cl_buffer());
@@ -151,8 +155,7 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(new_window.slide_window_slice_3D(slice));
+ } while (new_window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
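For context on the qinfo.scale/qoffset pair extracted in the configure step above, affine dequantization itself is a single multiply per element; a minimal host-side sketch of the arithmetic the CL kernel is expected to perform (values are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical QASYMM8 quantization parameters.
    const float   scale  = 0.05f;
    const int32_t offset = 128;

    const std::vector<uint8_t> q = {0, 100, 128, 200, 255};
    for (uint8_t v : q)
    {
        // Per-element affine dequantization: real = (q - offset) * scale.
        const float dequantized = (static_cast<int32_t>(v) - offset) * scale;
        std::printf("%3u -> % .2f\n", static_cast<unsigned>(v), dequantized);
    }
    return 0;
}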
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 7ad398412a..7cf1958c1b 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -23,17 +23,18 @@
*/
#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
@@ -51,11 +52,17 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
const DataLayout data_layout = src->data_layout();
@@ -63,41 +70,56 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true,
+ "Export to CLImage is not supported for the input tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true,
+ "Export to CLImage is not supported for the output tensor");
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx),
+ "Weights should have same width and height");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3,
+ "Strides larger than 3 not supported for 1x1 convolution.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 ||
+ weights->dimension(width_idx) == 9) &&
+ std::get<0>(conv_info.stride()) > 2,
"Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout");
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
- "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+ weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
+ "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
- "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+ weights->dimension(width_idx) != 5,
+ "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
}
}
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), "Fused activation in NHWC is only supported for floating point.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()),
+ "Fused activation in NHWC is only supported for floating point.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+ "M0 can only be greater than 0 and less than or equal to 8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
"N0 can only be: 1, 2, 3, 4, 8, and 16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
"K0 can only be: 1, 2, 3, 4, 8, and 16");
- if(desc.export_weights_to_cl_image)
+ if (desc.export_weights_to_cl_image)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
"K0 can only be: 4, 8, and 16");
@@ -106,9 +128,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
}
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -118,20 +140,19 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
"Biases size and number of dst feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
- "Biases should be one dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
}
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
const auto data_type = src->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -140,7 +161,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
}
return Status{};
}
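The quantized path above folds the input, weight and output scales into one float multiplier and hands it to quantization::calculate_quantized_multiplier to obtain an integer multiplier and shift. A standalone sketch of that decomposition idea (not the library's exact implementation) using frexp:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical quantization scales for input, weights and output.
    const float  in_scale = 0.02f, weight_scale = 0.003f, out_scale = 0.05f;
    const double multiplier = static_cast<double>(in_scale) * weight_scale / out_scale; // ~0.0012

    // Split into a mantissa in [0.5, 1) and a power-of-two exponent...
    int          exponent = 0;
    const double mantissa = std::frexp(multiplier, &exponent);

    // ...then express the mantissa as a Q31 fixed-point value and the exponent as a right shift.
    const int32_t quantized_multiplier = static_cast<int32_t>(std::lround(mantissa * (1ll << 31)));
    const int32_t right_shift          = -exponent;

    std::printf("multiplier=%.6f -> q_mult=%ld, shift=%d\n", multiplier,
                static_cast<long>(quantized_multiplier), right_shift);
    return 0;
}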
@@ -151,8 +173,14 @@ ClDirectConv2dKernel::ClDirectConv2dKernel()
_type = CLKernelType::DIRECT;
}
-void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -178,14 +206,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape,
- 1,
- src->data_type(),
- src->quantization_info());
+ auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
// Configure kernel window
Window win;
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
output_shape.collapse(2U, 1U);
const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
@@ -194,7 +219,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
// Create window and update padding
win = calculate_max_window(output_shape, Steps(n0, m0));
}
- else if(_data_layout == DataLayout::NCHW)
+ else if (_data_layout == DataLayout::NCHW)
{
_num_elems_processed_per_iteration = 1u;
win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
@@ -205,7 +230,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
std::stringstream kernel_name;
CLBuildOptions build_options;
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
kernel_name << "direct_convolution_nhwc";
@@ -221,22 +246,22 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
_export_output_to_cl_image = desc.export_output_to_cl_image;
// Update the padding for the weights tensor if we can export to cl_image
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
gemm::update_padding_for_cl_image(weights);
}
- if(_export_output_to_cl_image)
+ if (_export_output_to_cl_image)
{
gemm::update_padding_for_cl_image(dst);
}
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
gemm::update_padding_for_cl_image(src);
}
- if(biases != nullptr)
+ if (biases != nullptr)
{
build_options.add_option(std::string("-DHAS_BIAS"));
build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
@@ -246,9 +271,10 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
const auto act_function = act_info.activation();
const auto dst_data_type = dst->data_type();
- if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
{
// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -259,7 +285,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-cl-fast-relaxed-math");
}
- build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
+ build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE",
+ "-DSRC_TENSOR_TYPE=BUFFER");
build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0)));
build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1)));
@@ -267,9 +294,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0)));
build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1)));
build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2)));
- build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER");
+ build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE",
+ "-DDST_TENSOR_TYPE=BUFFER");
build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
- build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+ build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE",
+ "-DWEI_TENSOR_TYPE=BUFFER");
build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
@@ -284,7 +313,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -314,11 +343,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
- build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
- build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+ build_options.add_option_if(act_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+ build_options.add_option_if(act_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
}
- if(compile_context.get_ddk_version() >= 30)
+ if (compile_context.get_ddk_version() >= 30)
{
build_options.add_option("-fregister-allocation=64");
}
@@ -340,13 +371,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
- build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
+ build_options.add_option(
+ std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
- build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
- build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
+ build_options.add_option(
+ std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
+ build_options.add_option(
+ std::string("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -405,8 +440,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
_config_id += lower_string(string_from_data_layout(_data_layout));
}
-Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
return Status{};
@@ -420,52 +460,55 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
// Get initial windows
Window slice = window.first_slice_window_3D();
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
cl::Image2D weights_cl_image;
cl::Image2D output_cl_image;
cl::Image2D input_cl_image;
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
// Export tensor to cl_image
weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly);
}
- if(_export_output_to_cl_image)
+ if (_export_output_to_cl_image)
{
// Export tensor to cl_image
output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly);
}
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
// Export tensor to cl_image
input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly);
}
unsigned int idx = 0;
- if(_export_input_to_cl_image)
+ if (_export_input_to_cl_image)
{
_kernel.setArg(idx++, input_cl_image);
}
add_4d_tensor_nhwc_argument(idx, src);
- if(_export_output_to_cl_image)
+ if (_export_output_to_cl_image)
{
_kernel.setArg(idx++, output_cl_image);
}
add_4d_tensor_nhwc_argument(idx, dst);
- if(_export_weights_to_cl_image)
+ if (_export_weights_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
add_4d_tensor_nhwc_argument(idx, weights);
- if(biases != nullptr)
+ if (biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
}
@@ -476,7 +519,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
add_3D_tensor_argument(idx1, weights, slice);
- if(biases != nullptr)
+ if (biases != nullptr)
{
Window slice_biases;
slice_biases.use_tensor_dimensions(biases->info()->tensor_shape());
@@ -491,8 +534,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
}
} // namespace kernels
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
index 7132762b35..c934c825ca 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -68,16 +69,27 @@ public:
* @param[in]  act_info        Contains activation information described in @ref ActivationLayerInfo.
* @param[in] desc Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClDirectConv2dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
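A hedged usage sketch of the conv2d entry points declared above, for the NHWC/F32 case; the shapes, PadStrideInfo values and descriptor fields are illustrative assumptions, and the export-to-cl_image flags are left at their assumed defaults:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"

using namespace arm_compute;

void direct_conv2d_sketch(const CLCompileContext &compile_context)
{
    // Hypothetical NHWC F32 case: 16 input channels, 3x3 kernel, 32 output channels.
    TensorInfo src(TensorShape(16U, 64U, 64U, 1U), 1, DataType::F32);
    TensorInfo weights(TensorShape(16U, 3U, 3U, 32U), 1, DataType::F32);
    TensorInfo biases(TensorShape(32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 64U, 64U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo       conv_info(1, 1, 1, 1); // stride 1, pad 1 keeps 64x64
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    DirectConvComputeKernelInfo desc{};
    desc.m0 = 1; // within the (0, 8] range checked by validate()
    desc.n0 = 4; // one of {1, 2, 3, 4, 8, 16}
    desc.k0 = 4; // one of {1, 2, 3, 4, 8, 16}

    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClDirectConv2dKernel::validate(&src, &weights, &biases, &dst,
                                                                               conv_info, act_info, desc));
    opencl::kernels::ClDirectConv2dKernel kernel;
    kernel.configure(compile_context, &src, &weights, &biases, &dst, conv_info, act_info, desc);
}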
@@ -85,9 +97,9 @@ public:
public:
DataLayout _data_layout{};
PadStrideInfo _conv_info{};
- bool _export_weights_to_cl_image{ false };
- bool _export_output_to_cl_image{ false };
- bool _export_input_to_cl_image{ false };
+ bool _export_weights_to_cl_image{false};
+ bool _export_output_to_cl_image{false};
+ bool _export_input_to_cl_image{false};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
index 6191178911..8002520a87 100644
--- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
@@ -40,7 +41,11 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
@@ -49,20 +54,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported");
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0),
+ "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional");
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right));
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom));
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) >
+ (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) >
+ (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) >
+ (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back));
- if(src2 != nullptr)
+ if (src2 != nullptr)
{
- if(is_data_type_quantized(src0->data_type()))
+ if (is_data_type_quantized(src0->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
}
@@ -70,15 +80,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+ "Biases size and number of dst feature maps should match");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
}
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
}
@@ -91,8 +104,12 @@ ClDirectConv3dKernel::ClDirectConv3dKernel()
_type = CLKernelType::DIRECT;
}
-void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst,
- const Conv3dInfo &conv3d_info)
+void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -149,13 +166,13 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co
build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
- if(src2 != nullptr)
+ if (src2 != nullptr)
{
build_options.add_option(std::string("-DHAS_BIAS"));
build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type())));
}
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform();
const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform();
@@ -218,7 +235,11 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co
_config_id += support::cpp11::to_string(dst_channels);
}
-Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status ClDirectConv3dKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info));
return Status{};
@@ -229,21 +250,28 @@ void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
// Get initial windows
Window slice = window.first_slice_window_3D();
- slice.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3), slice.y().step()), slice.y().step()));
+ slice.set(Window::DimY, Window::Dimension(0,
+ ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) *
+ dst->info()->dimension(3),
+ slice.y().step()),
+ slice.y().step()));
slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1));
unsigned int idx = 0;
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice);
add_4D_tensor_argument(idx, weights, slice);
- if(biases != nullptr)
+ if (biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
}
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
index de4f0ce216..cb7509d8fa 100644
--- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
@@ -70,14 +70,23 @@ public:
* @param[out] dst         Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent a batch of dsts.
* @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClDirectConv3dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
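And the analogous hedged sketch for the 3D variant declared above: NDHWC only, with shapes and a default-constructed Conv3dInfo (unit stride, zero padding, no fused activation assumed) used purely for illustration:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h" // assumed location of Conv3dInfo
#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"

using namespace arm_compute;

void direct_conv3d_sketch(const CLCompileContext &compile_context)
{
    // Hypothetical NDHWC F32 case: 8 input channels, 3x3x3 kernel, 16 OFMs.
    TensorInfo src(TensorShape(8U, 16U, 16U, 16U, 1U), 1, DataType::F32);   // [C, W, H, D, N]
    TensorInfo weights(TensorShape(16U, 8U, 3U, 3U, 3U), 1, DataType::F32); // [OFM, IFM, kw, kh, kd]
    TensorInfo biases(TensorShape(16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(16U, 14U, 14U, 14U, 1U), 1, DataType::F32);  // 16 - 3 + 1 = 14 per spatial dim
    src.set_data_layout(DataLayout::NDHWC);
    weights.set_data_layout(DataLayout::NDHWC);
    dst.set_data_layout(DataLayout::NDHWC);

    const Conv3dInfo conv3d_info{}; // defaults assumed: unit stride, zero padding, no activation

    ARM_COMPUTE_ERROR_THROW_ON(
        opencl::kernels::ClDirectConv3dKernel::validate(&src, &weights, &biases, &dst, conv3d_info));
    opencl::kernels::ClDirectConv3dKernel kernel;
    kernel.configure(compile_context, &src, &weights, &biases, &dst, conv3d_info);
}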
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
index 6beee576b5..cdb3527a92 100644
--- a/src/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -23,18 +23,20 @@
*/
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/common/utils/Validate.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
+
#include <map>
namespace arm_compute
@@ -47,25 +49,20 @@ namespace
{
constexpr unsigned int vector_size_byte_opencl = 16;
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
- { ArithmeticOperation::DIV, "DIV" },
- { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
- { ArithmeticOperation::MIN, "MIN" },
- { ArithmeticOperation::MAX, "MAX" },
- { ArithmeticOperation::POWER, "POWER" },
- { ArithmeticOperation::PRELU, "PRELU" },
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = {
+ {ArithmeticOperation::ADD, "ADD"}, {ArithmeticOperation::SUB, "SUB"},
+ {ArithmeticOperation::DIV, "DIV"}, {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"},
+ {ArithmeticOperation::MIN, "MIN"}, {ArithmeticOperation::MAX, "MAX"},
+ {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"},
};
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
- { ArithmeticOperation::ADD, "ADD" },
- { ArithmeticOperation::SUB, "SUB" },
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = {
+ {ArithmeticOperation::ADD, "ADD"},
+ {ArithmeticOperation::SUB, "SUB"},
};
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string
+generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
{
std::string config_id;
// Set config_id for enabling LWS tuning
@@ -79,12 +76,18 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const
return config_id;
}
-Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape)
+Status validate_in_place_output_shape(const bool in_place,
+ const bool src1_in_place,
+ const ITensorInfo &src1,
+ const ITensorInfo &src2,
+ const ITensorInfo &dst,
+ const TensorShape &out_shape)
{
- if(in_place)
+ if (in_place)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
- "Wrong shape for dst, cannot do in_place calculation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
}
else
{
@@ -94,7 +97,9 @@ Status validate_in_place_output_shape(const bool in_place, const bool src1_in_pl
return Status{};
}
-Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1,
+ const ITensorInfo &src2,
+ const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
@@ -110,11 +115,12 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
}
return Status{};
@@ -136,25 +142,27 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
}
return Status{};
}
-Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+Status
+validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
- if(is_data_type_quantized_symmetric(src1.data_type()))
+ if (is_data_type_quantized_symmetric(src1.data_type()))
{
const int32_t in1_offset = src1.quantization_info().uniform().offset;
const int32_t in2_offset = src2.quantization_info().uniform().offset;
@@ -170,13 +178,15 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst");
- ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+ "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
- if(is_data_type_quantized_symmetric(dst.data_type()))
+ if (is_data_type_quantized_symmetric(dst.data_type()))
{
const int32_t offset = dst.quantization_info().uniform().offset;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero");
@@ -185,19 +195,26 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I
return Status{};
}
-CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string)
+CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1,
+ const ITensorInfo &src2,
+ const ITensorInfo &dst,
+ const std::string &operation_string)
{
CLBuildOptions build_opts;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type()));
- build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_IN1=" +
+ support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_IN2=" +
+ support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DOP=" + operation_string);
- if(is_data_type_quantized(src1.data_type()))
+ if (is_data_type_quantized(src1.data_type()))
{
const UniformQuantizationInfo iq1info = src1.quantization_info().uniform();
const UniformQuantizationInfo iq2info = src2.quantization_info().uniform();
@@ -223,14 +240,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s
std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst)
{
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
- Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+ Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
return std::make_pair(Status{}, win);
}
-std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
const TensorShape &out_shape = broadcast_pair.first;
auto_init_if_empty(dst, out_shape, 1, src1.data_type());
@@ -238,9 +258,11 @@ std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators
return configure_window_arithmetic_common(dst);
}
-std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
const TensorShape &out_shape = broadcast_pair.first;
set_shape_if_empty(dst, out_shape);
@@ -249,9 +271,11 @@ std::pair<Status, Window> validate_and_configure_window_for_logical_binary_opera
return configure_window_arithmetic_common(dst);
}
-std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
{
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
const TensorShape &out_shape = broadcast_pair.first;
auto_init_if_empty(dst, out_shape, 1, src1.data_type());
@@ -265,21 +289,24 @@ ClElementwiseKernel::ClElementwiseKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst)
{
// Configure kernel window
auto win_config = validate_and_configure_window(*src1, *src2, *dst);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
std::string kernel_name = "elementwise_operation_" + name();
- if(is_data_type_quantized(src1->data_type()))
+ if (is_data_type_quantized(src1->data_type()))
{
kernel_name += "_quantized";
}
// Set kernel build options
CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst);
- if(_act_info.enabled())
+ if (_act_info.enabled())
{
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a()));
@@ -299,9 +326,11 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src_0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src_1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
@@ -311,17 +340,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c
bool can_collapse = true;
const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -337,7 +367,7 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c
unsigned int idx = 0;
add_3D_tensor_argument(idx, src_0, slice_src1);
add_3D_tensor_argument(idx, src_1, slice_src2);
- if(!in_place)
+ if (!in_place)
{
add_3D_tensor_argument(idx, dst, slice);
}
@@ -345,13 +375,16 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c
enqueue(queue, *this, slice, lws_hint());
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
/** Logical binary */
-void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context,
+ LogicalOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst));
@@ -359,7 +392,10 @@ void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, L
configure_common(compile_context, src1, src2, dst);
}
-Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+Status ClLogicalBinaryKernel::validate(LogicalOperation op,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_UNUSED(op);
ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not);
@@ -369,14 +405,16 @@ Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *s
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone())
+ .first);
return Status{};
}
std::string ClLogicalBinaryKernel::name()
{
- switch(_op)
+ switch (_op)
{
case LogicalOperation::And:
return "AND";
@@ -390,30 +428,38 @@ std::string ClLogicalBinaryKernel::name()
return "";
}
-std::pair<Status, Window> ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
{
return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst);
}
-CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+CLBuildOptions
+ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
{
    // The arithmetic utility functions can be shared
return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
}
-std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
{
return generate_id_for_tuning_common(kernel_name, src1, dst);
}
/** Arithmetic operations with saturation*/
-void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
+void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output,
const ConvertPolicy &policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info));
- auto padding_info = get_padding_info({ input1, input2, output });
+ auto padding_info = get_padding_info({input1, input2, output});
_policy = policy;
_op = op;
@@ -422,24 +468,34 @@ void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_cont
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
+Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ConvertPolicy &policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(op, policy);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone())
+ .first);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
return Status{};
}
-std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1,
+ ITensorInfo &input2,
+ ITensorInfo &output)
{
return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
}
-CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1,
+ const ITensorInfo &input2,
+ const ITensorInfo &output)
{
const bool has_float_out = is_data_type_float(output.data_type());
auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
@@ -447,7 +503,9 @@ CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensor
return build_options;
}
-std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &input1,
+ const ITensorInfo &output)
{
auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
@@ -461,12 +519,16 @@ std::string ClSaturatedArithmeticKernel::name()
}
/** Arithmetic operations*/
-void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+void ClArithmeticKernel::configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info));
- auto padding_info = get_padding_info({ src1, src2, dst });
+ auto padding_info = get_padding_info({src1, src2, dst});
_op = op;
_act_info = act_info;
@@ -474,33 +536,42 @@ void ClArithmeticKernel::configure(const ClCompileContext &compile_context, Arit
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClArithmeticKernel::validate(ArithmeticOperation op,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
- if(op == ArithmeticOperation::DIV)
+ if (op == ArithmeticOperation::DIV)
{
// Partial integer support S32/F32/F16
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
}
- else if(op == ArithmeticOperation::POWER)
+ else if (op == ArithmeticOperation::POWER)
{
        // The power operator doesn't support integer arithmetic
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
}
else
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone())
+ .first);
}
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
return Status{};
}
-std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
{
- if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
+ if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
{
// Division and Power operators don't support integer arithmetic
return validate_and_configure_window_for_division(src1, src2, dst);
@@ -511,11 +582,14 @@ std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITen
}
}
-CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+CLBuildOptions
+ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
{
return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
}
-std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &src1,
+ const ITensorInfo &dst)
{
return generate_id_for_tuning_common(kernel_name, src1, dst);
}
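
The -DVEC_SIZE_IN*/-DVEC_SIZE_OUT/-DVEC_SIZE_LEFTOVER options assembled above all come from one small calculation: divide the 16-byte OpenCL vector budget by the element size, cap it at the output width, and pass the remainder along as the leftover. The standalone sketch below reproduces that arithmetic with a hypothetical sketch_adjust_vec_size() standing in for ACL's adjust_vec_size(); the exact capping policy of the real helper is assumed, not quoted, so treat the printed values as an example only.

// Illustrative sketch of the vector-size arithmetic behind the -DVEC_SIZE_* options.
#include <algorithm>
#include <cstdio>

constexpr unsigned int vector_size_byte_opencl = 16; // same constant the kernels above use

// Assumption: the adjusted width never exceeds the tensor's first dimension.
unsigned int sketch_adjust_vec_size(unsigned int requested, unsigned int dim_x)
{
    return std::min(requested, dim_x);
}

int main()
{
    const unsigned int element_size = 4;  // e.g. F32
    const unsigned int dst_width_x  = 18; // hypothetical output width
    const unsigned int vec_size     = sketch_adjust_vec_size(vector_size_byte_opencl / element_size, dst_width_x);
    const unsigned int leftover     = dst_width_x % vec_size; // what -DVEC_SIZE_LEFTOVER would carry
    std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u\n", vec_size, leftover); // VEC_SIZE=4 VEC_SIZE_LEFTOVER=2
    return 0;
}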
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h
index ea3ddb2124..73e54542b2 100644
--- a/src/gpu/cl/kernels/ClElementwiseKernel.h
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.h
@@ -25,8 +25,9 @@
#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/core/KernelTypes.h"
+
#include "src/core/common/Macros.h"
+#include "src/core/KernelTypes.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -65,24 +66,28 @@ protected:
*
* @return a pair of Status and Window
*/
- virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
+ virtual std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
/** Generate the build options for the specific kernel
*
     * @return a CLBuildOptions struct
*/
- virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
+ virtual CLBuildOptions
+ generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
/** Generate the identifier for tuning
*
     * @return a string
*/
- virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
+ virtual std::string
+ generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
    /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
*
*/
- void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+ void
+ configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
ActivationLayerInfo _act_info{};
};
@@ -100,23 +105,31 @@ public:
* @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
* @param[in] dst Destination tensor info. Data types supported: same as @p src1.
*/
- void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+ void configure(const ClCompileContext &compile_context,
+ LogicalOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClLogicalBinaryKernel::configure()
*
* @return a status
*/
- static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+ static Status
+ validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
private:
// Inherited methods overridden:
std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
- CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
-
- LogicalOperation _op{ LogicalOperation::Unknown };
+ std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+ CLBuildOptions
+ generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+ std::string
+ generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+ LogicalOperation _op{LogicalOperation::Unknown};
};
/** Addition operation */
@@ -135,7 +148,12 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
+ void configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output,
+ const ConvertPolicy &policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
@@ -144,15 +162,23 @@ public:
*
* @return a status
*/
- static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
+ static Status validate(ArithmeticOperation op,
+ const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ConvertPolicy &policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
protected:
// Inherited methods overridden:
std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
- CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
+ std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+ CLBuildOptions
+ generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+ std::string generate_id_for_tuning(const std::string &kernel_name,
+ const ITensorInfo &input1,
+ const ITensorInfo &output) override;
private:
ConvertPolicy _policy{};
@@ -174,7 +200,11 @@ public:
* @param[in] dst Destination tensor info. Data types supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+ void configure(const ClCompileContext &compile_context,
+ ArithmeticOperation op,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
@@ -183,14 +213,21 @@ public:
*
* @return a status
*/
- static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(ArithmeticOperation op,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
protected:
// Inherited methods overridden:
std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
- CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+ std::pair<Status, Window>
+ validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+ CLBuildOptions
+ generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+ std::string
+ generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
private:
ArithmeticOperation _op{};
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
index 744a3a40c7..f7c198ee54 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -23,11 +23,12 @@
*/
#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
@@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16;
Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- if(op == ElementWiseUnary::LOGICAL_NOT)
+ if (op == ElementWiseUnary::LOGICAL_NOT)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
}
- else if(op == ElementWiseUnary::NEG)
+ else if (op == ElementWiseUnary::NEG)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
}
- else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
+ else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
}
else
{
@@ -64,7 +66,7 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const
}
// Validate in case of configured dst
- if(dst.total_size() > 0)
+ if (dst.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
@@ -80,19 +82,23 @@ ClElementWiseUnaryKernel::ClElementWiseUnaryKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op)
+void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const ElementWiseUnary &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
+ const unsigned int num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
- std::string kernel_name = "elementwise_unary";
- const int vec_size_x = num_elems_processed_per_iteration;
- const int dst_width_x = dst->dimension(0);
- if(is_data_type_quantized(src->data_type()))
+ std::string kernel_name = "elementwise_unary";
+ const int vec_size_x = num_elems_processed_per_iteration;
+ const int dst_width_x = dst->dimension(0);
+ if (is_data_type_quantized(src->data_type()))
{
kernel_name += "_quantized";
}
@@ -101,7 +107,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
@@ -110,7 +116,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
}
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
build_opts.add_option("-DOPERATION=rsqrt_op");
@@ -169,8 +175,9 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
do
{
@@ -178,8 +185,7 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
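
The do { ... } while (collapsed.slide_window_slice_3D(slice)) loops reformatted throughout these kernels share one shape: take the first 3D slice of the collapsed window, enqueue the kernel for it, then keep sliding until no slice remains. The toy C++ below imitates only that control flow; SliceWindow and slide_next_slice() are stand-ins invented for the sketch, not the real Window API.

// Minimal stand-in for the enqueue-per-slice loop pattern.
#include <cstdio>

struct SliceWindow
{
    int num_slices;
    int current = 0;
    // Analogous to slide_window_slice_3D(): advance and report whether a slice is left.
    bool slide_next_slice() { return ++current < num_slices; }
};

int main()
{
    SliceWindow collapsed{3};
    do
    {
        // Stands in for add_3D_tensor_argument(...) + enqueue(queue, *this, slice, lws_hint()).
        std::printf("enqueue slice %d\n", collapsed.current);
    } while (collapsed.slide_next_slice());
    return 0;
}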
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
index 0f270f25e8..81721f8ca8 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
@@ -47,7 +47,10 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src.
* @param[in] op Element wise unary operation to perform.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const ElementWiseUnary &op);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClElementWiseUnaryKernel::configure()
diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp
index a9345ee334..96ad503730 100644
--- a/src/gpu/cl/kernels/ClFillKernel.cpp
+++ b/src/gpu/cl/kernels/ClFillKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -47,9 +48,10 @@ ClFillKernel::ClFillKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor,
- const PixelValue &constant_value,
- Window *window)
+void ClFillKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window));
@@ -60,7 +62,7 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf
// Create and update the window (if needed)
_full_window = calculate_max_window(*tensor);
Window win = _full_window;
- if(window != nullptr)
+ if (window != nullptr)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
win = *window;
@@ -70,9 +72,10 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf
const bool multi_access_x = output_width_x >= vec_size_x;
const bool remainder_x = output_width_x % vec_size_x > 0;
- if(multi_access_x)
+ if (multi_access_x)
{
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -81,7 +84,9 @@ void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInf
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" +
+ support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
_kernel = create_kernel(compile_context, "memset", build_opts.options());
}
@@ -89,7 +94,7 @@ Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &const
{
ARM_COMPUTE_UNUSED(tensor);
ARM_COMPUTE_UNUSED(constant_value);
- if(window != nullptr)
+ if (window != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
}
@@ -101,7 +106,8 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto tensor =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
// Collapse all the batches on the third
Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
@@ -112,8 +118,7 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman
unsigned int idx = 0;
add_3D_tensor_argument(idx, tensor, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
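
When multi_access_x holds, ClFillKernel::configure rounds the window end up to a multiple of the vector size and, if a remainder exists, defines LAST_ACCESSED_X so the final vector access stays inside the tensor. The sketch below redoes that arithmetic for one hypothetical width; the local ceil_to_multiple() is written from the helper's name and usage above, not copied from ACL.

// Illustrative arithmetic behind the multi_access_x / LAST_ACCESSED_X path.
#include <algorithm>
#include <cstdio>

int ceil_to_multiple(int value, int step) // assumed behaviour of the ACL helper of the same name
{
    return ((value + step - 1) / step) * step;
}

int main()
{
    const int output_width_x = 21; // hypothetical tensor width
    const int vec_size_x     = 8;  // 16 bytes / 2-byte elements, as in the kernel above
    const int window_end     = ceil_to_multiple(output_width_x, vec_size_x);    // 24: padded window end
    const int last_accessed  = std::max(output_width_x - vec_size_x, 0);        // 13: -DLAST_ACCESSED_X
    std::printf("window_end=%d LAST_ACCESSED_X=%d\n", window_end, last_accessed);
    return 0;
}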
diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h
index f25cf928ad..5d69fbfbd1 100644
--- a/src/gpu/cl/kernels/ClFillKernel.h
+++ b/src/gpu/cl/kernels/ClFillKernel.h
@@ -47,7 +47,10 @@ public:
* @param[in] constant_value The value used to fill the planes of the tensor
* @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClFillKernel::configure()
diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp
index f9f834875a..358e84012b 100644
--- a/src/gpu/cl/kernels/ClFloorKernel.cpp
+++ b/src/gpu/cl/kernels/ClFloorKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -52,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
// Validate in case of configured output
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -76,9 +77,9 @@ void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITe
// Validate
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
+ const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
@@ -105,8 +106,9 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
@@ -117,8 +119,7 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
index accafeecc2..e0d925dfb2 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
@@ -29,14 +29,13 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -50,26 +49,35 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info)
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(src0->data_type() == DataType::QASYMM8)
+ if (src0->data_type() == DataType::QASYMM8)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
const int m = gemm_info.m();
@@ -83,7 +91,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n));
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d())
+ if (gemm_info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
}
@@ -92,9 +100,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
}
@@ -102,8 +111,13 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
{
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
@@ -115,17 +129,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_dst_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_dst_as_3d)
+ if (reinterpret_input_as_3d == reinterpret_dst_as_3d)
{
reinterpret_dst_as_3d = false;
}
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+ auto_init_if_empty(*dst, src0->clone()
+ ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))
+ .set_data_type(DataType::S32));
TensorInfo tmp_info(*dst);
- if(reinterpret_dst_as_3d)
+ if (reinterpret_dst_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -138,12 +154,12 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
num_elems_processed_per_iteration_x = rhs_info.n0;
num_elems_processed_per_iteration_y = lhs_info.m0;
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// RHS matrix still needs padding on the X
- AccessWindowStatic src1_access(src1, 0, 0,
- ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
- src1->dimension(1));
+ AccessWindowStatic src1_access(
+ src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop
@@ -153,7 +169,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
collapsed = win.collapse(win, dimension_to_collapse);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, collapsed);
}
} // namespace
@@ -163,8 +180,13 @@ ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel()
_type = CLKernelType::GEMM;
}
-void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -175,11 +197,11 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
// We still need padding on the X dimension for the RHS matrix
- auto padding_info = get_padding_info({ src0, dst });
+ auto padding_info = get_padding_info({src0, dst});
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
{
_reinterpret_input_as_3d = false;
_reinterpret_output_as_3d = false;
@@ -192,7 +214,8 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+ auto win_config =
+ validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -212,8 +235,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
CLBuildOptions build_opts;
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1)));
@@ -258,19 +283,19 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
{
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ dst->clone().get(), lhs_info, rhs_info, gemm_info,
num_elements_processed)
- .first);
+ .first);
return Status{};
}
@@ -280,11 +305,13 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -296,7 +323,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
- if(_reinterpret_input_as_3d)
+ if (_reinterpret_input_as_3d)
{
// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
@@ -304,10 +331,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
- if(_reinterpret_output_as_3d)
+ if (_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -317,7 +344,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
+ if (!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -330,8 +357,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
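
The reformatted k0/n0 checks rely on the usual bit trick: (x & (x - 1)) is non-zero exactly when x is not a power of two, with 3 whitelisted explicitly; the separate k0 > 16 and m0 range checks supply the upper bound that makes the "Only 2,3,4,8,16" message accurate. A quick standalone check of which values the bitwise condition alone accepts:

// Which block sizes pass the bitwise condition used in validate_arguments above.
#include <cstdio>

bool passes_block_size_check(unsigned int x)
{
    // Rejected when ((x & (x - 1)) && x != 3): i.e. accepted for powers of two and for 3.
    return !((x & (x - 1)) && x != 3);
}

int main()
{
    for (unsigned int x = 1; x <= 16; ++x)
    {
        std::printf("%2u -> %s\n", x, passes_block_size_check(x) ? "accepted" : "rejected");
    }
    return 0; // accepted: 1, 2, 3, 4, 8, 16 (the range checks then exclude 1 and cap at 16)
}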
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
index 4b328e0ab8..4f87096158 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -55,25 +56,34 @@ public:
* rhs_info.k0: same as lhs_info.k0
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
index 15493f7ddc..ddbc809cdd 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -29,13 +29,12 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -51,45 +50,55 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
const int m = gemm_info.m();
const int n = gemm_info.n();
const int k = gemm_info.k();
- TensorShape tensor_shape0{ src0->tensor_shape() };
+ TensorShape tensor_shape0{src0->tensor_shape()};
tensor_shape0.set(0, k);
tensor_shape0.set(1, m);
- TensorShape tensor_shape1{ src1->tensor_shape() };
+ TensorShape tensor_shape1{src1->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+ const TensorInfo tensor_info_reshaped0 =
+ src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
@@ -99,19 +108,24 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info,
- ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
{
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -123,7 +137,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
// Configure kernel window
num_elems_processed_per_iteration_x = rhs_info.n0;
num_elems_processed_per_iteration_y = lhs_info.m0;
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// Collapse along the Z direction
// This collapse needs to be here in order to tune the Z dimension of LWS
@@ -140,8 +155,13 @@ ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel()
_type = CLKernelType::GEMM;
}
-void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
@@ -154,11 +174,12 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
const unsigned int num_dimensionssrc0 = src0->num_dimensions();
_slide_matrix_b = (src1->num_dimensions() >= num_dimensionssrc0);
- auto padding_info = get_padding_info({ src0, src1, dst });
+ auto padding_info = get_padding_info({src0, src1, dst});
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+ auto win_config =
+ validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -171,8 +192,10 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
// Create build options
CLBuildOptions build_opts;
build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
@@ -230,19 +253,19 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info)
{
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ dst->clone().get(), lhs_info, rhs_info, gemm_info,
num_elements_processed)
- .first);
+ .first);
return Status{};
}
@@ -252,11 +275,13 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -268,7 +293,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
- if(_reinterpret_output_as_3d)
+ if (_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
@@ -281,7 +306,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
+ if (!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -295,8 +320,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
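
Editorial aside (not part of the patch): the validate_arguments() hunks above repeatedly use the bit test `(v & (v - 1)) && v != 3` behind the message "Only 2,3,4,8,16 are supported for k0/n0". The standalone sketch below shows why that expression, combined with the separate `k0 > 16` check, accepts exactly those values; `is_supported_block_size` is a hypothetical helper, and the lower bound of 2 is assumed from the error message rather than taken from this hunk.

// Illustrative sketch only, not library code.
#include <cassert>

// Hypothetical helper mirroring the "Only 2,3,4,8,16 are supported" checks above.
// (v & (v - 1)) == 0 is the classic power-of-two test; 3 is special-cased,
// the upper bound of 16 mirrors the separate k0 > 16 check, and the lower
// bound of 2 is assumed from the error message.
static bool is_supported_block_size(unsigned int v)
{
    const bool power_of_two = (v & (v - 1)) == 0;
    return (power_of_two || v == 3) && v >= 2 && v <= 16;
}

int main()
{
    for (unsigned int v : {2u, 3u, 4u, 8u, 16u})
    {
        assert(is_supported_block_size(v)); // all documented values pass
    }
    assert(!is_supported_block_size(5u));  // neither a power of two nor 3
    assert(!is_supported_block_size(32u)); // power of two but above the 16 cap
    return 0;
}
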
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
index a16f500f11..d7b785996f 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -64,25 +65,34 @@ public:
*
* @note lhs_info.k0 must be equal to rhs_info.k0
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMReshapeInfo &gemm_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_output_as_3d{ false };
- unsigned int _k{ 1 };
- bool _use_dummy_work_items{ false };
+ bool _slide_matrix_b{true};
+ bool _reinterpret_output_as_3d{false};
+ unsigned int _k{1};
+ bool _use_dummy_work_items{false};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
index 5d552b8d63..2f1f3b8df0 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -29,14 +29,13 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -54,45 +53,57 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- if(src0->data_type() == DataType::QASYMM8)
+ if (src0->data_type() == DataType::QASYMM8)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+ DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16),
+ "Only 2,3,4,8,16 are supported for n0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
const int m = gemm_info.m;
const int n = gemm_info.n;
const int k = gemm_info.k;
- TensorShape tensor_shape1{ src1->tensor_shape() };
+ TensorShape tensor_shape1{src1->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d)
+ if (gemm_info.reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
}
@@ -103,11 +114,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
- if(output_stage.type == GEMMLowpOutputStageType::NONE)
+ if (output_stage.type == GEMMLowpOutputStageType::NONE)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
}
@@ -117,39 +128,42 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+ (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
"Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
// Checks performed if the dst stage needs to be fused
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
// If a_offset == 0, vector_sum_col can be a nullptr
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(gemm_info.b_offset != 0)
+ if (gemm_info.b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if mm result is a 3D reinterpretation
- const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2]));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (expected_dst_shape[1] * expected_dst_shape[2]));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
- if(expected_dst_shape.num_dimensions() > 1)
+ if (expected_dst_shape.num_dimensions() > 1)
{
const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -161,30 +175,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
"vector_sum_row must have the same number of batches of dst tensor");
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
}
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- if(output_multipliers != nullptr && output_shifts != nullptr)
+ if (output_multipliers != nullptr && output_shifts != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
+ if (output_stage.is_quantized_per_channel)
{
ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
@@ -194,9 +210,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts,
+ ElementsProcessed &num_elements_processed)
{
const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -211,16 +234,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+ if (reinterpret_input_as_3d == reinterpret_output_as_3d)
{
reinterpret_output_as_3d = false;
}
// dst tensor auto initialization if not yet initialized
const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
- if(output_stage.type != GEMMLowpOutputStageType::NONE)
+ if (output_stage.type != GEMMLowpOutputStageType::NONE)
{
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
}
else
{
@@ -229,7 +253,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -242,12 +266,14 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
@@ -255,17 +281,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
// No access window needed for vector_sum_row
ARM_COMPUTE_UNUSED(vector_sum_row);
- if(bias != nullptr)
+ if (bias != nullptr)
{
AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
window_changed = window_changed || update_window_and_padding(win_out, bias_access);
}
- if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+ if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
{
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+ num_elems_processed_per_iteration_x);
AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
+ window_changed =
+ window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
}
}
@@ -275,7 +303,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
collapsed = win.collapse(win, dimension_to_collapse);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, collapsed);
}
} // namespace
@@ -285,15 +314,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedO
_type = CLKernelType::GEMM;
}
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
- const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
- auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row });
+ auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -307,7 +343,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
{
_reinterpret_input_as_3d = false;
_reinterpret_output_as_3d = false;
@@ -320,7 +356,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed);
+ auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -341,8 +378,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon
CLBuildOptions build_opts;
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
@@ -361,12 +400,12 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon
std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
kernel_name += rhs_info.transpose ? "t" : "nt";
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
kernel_name += "_fused_output_stage_fixedpoint";
_fuse_output_stage = true;
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0 && vector_sum_col != nullptr)
+ if (a_offset != 0 && vector_sum_col != nullptr)
{
build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -377,9 +416,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon
build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
// In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro.
- if(!_is_quantized_per_channel)
+ if (!_is_quantized_per_channel)
{
- build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+ build_opts.add_option("-DRESULT_MULTIPLIER=" +
+ support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
}
else
@@ -432,42 +472,56 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- dst->clone().get(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
- output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
- num_elements_processed)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ bias != nullptr ? bias->clone().get() : nullptr,
+ output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+ output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+ num_elements_processed)
+ .first);
return Status{};
}
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
- const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
- const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
- const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
- const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
-
- if(src1->info()->num_dimensions() < 3)
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ const auto output_shifts =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+ const auto output_multipliers =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -479,7 +533,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
- if(_reinterpret_input_as_3d)
+ if (_reinterpret_input_as_3d)
{
// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
@@ -487,10 +541,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
- if(_reinterpret_output_as_3d)
+ if (_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+ const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -515,7 +569,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
+ if (!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -527,19 +581,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
- if(_reinterpret_input_as_3d)
+ if (_reinterpret_input_as_3d)
{
// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
idx++;
}
- if(_reinterpret_output_as_3d)
+ if (_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
idx++;
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
@@ -548,8 +602,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
}
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
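
Editorial aside (not part of the patch): the validate() body above clones each optional ITensorInfo only when the pointer is non-null, e.g. `vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr`, so that omitted optional tensors are passed through as nullptr instead of being dereferenced. A minimal sketch of that guarded-clone idiom follows, using a hypothetical `Info` stand-in rather than the real ITensorInfo interface.

// Illustrative sketch only; Info is a hypothetical stand-in for ITensorInfo.
#include <iostream>
#include <memory>

struct Info
{
    std::unique_ptr<Info> clone() const { return std::make_unique<Info>(*this); }
};

// Downstream consumer that tolerates a missing optional argument, as the
// windowing helpers in the hunks above are expected to do.
void consume(const Info *maybe)
{
    std::cout << (maybe != nullptr ? "validating optional info\n" : "optional info omitted\n");
}

int main()
{
    Info        src;
    const Info *optional_bias = nullptr; // caller did not provide a bias

    // Guarded-clone idiom: only dereference (and clone) when the pointer is non-null.
    auto src_clone  = src.clone();
    auto bias_clone = optional_bias != nullptr ? optional_bias->clone() : nullptr;

    consume(src_clone.get());
    consume(bias_clone.get()); // get() yields nullptr for an empty unique_ptr
    return 0;
}
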
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
index a77604db7c..1d4696b089 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -70,31 +71,44 @@ public:
* @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
* Supported data types: S32.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr,
- ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ ITensorInfo *bias = nullptr,
+ ITensorInfo *output_multipliers = nullptr,
+ ITensorInfo *output_shifts = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr,
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _is_quantized_per_channel{ false };
- bool _fuse_output_stage{ false };
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _is_quantized_per_channel{false};
+ bool _fuse_output_stage{false};
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index 792c71da76..030c11d069 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -23,16 +23,15 @@
*/
#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
namespace arm_compute
@@ -47,39 +46,51 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4),
+ "Only 1,2,4 are supported for m0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8),
+ "Only 1,4,8 are supported for n0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
const int m = gemm_info.m;
const int n = gemm_info.n;
const int k = gemm_info.k;
- TensorShape tensor_shape1{ src1->tensor_shape() };
+ TensorShape tensor_shape1{src1->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d)
+ if (gemm_info.reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
}
@@ -90,11 +101,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
- if(output_stage.type == GEMMLowpOutputStageType::NONE)
+ if (output_stage.type == GEMMLowpOutputStageType::NONE)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
}
@@ -104,38 +115,41 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+ (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
"Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
// Checks performed if the dst stage needs to be fused
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
// If a_offset == 0, vector_sum_col can be a nullptr
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(gemm_info.b_offset != 0)
+ if (gemm_info.b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if mm result is a 3D reinterpretation
- const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2]));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (expected_dst_shape[1] * expected_dst_shape[2]));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
- if(expected_dst_shape.num_dimensions() > 1)
+ if (expected_dst_shape.num_dimensions() > 1)
{
const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -147,30 +161,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
"vector_sum_row must have the same number of batches of dst tensor");
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
}
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- if(output_multipliers != nullptr && output_shifts != nullptr)
+ if (output_multipliers != nullptr && output_shifts != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
+ if (output_stage.is_quantized_per_channel)
{
ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
@@ -180,9 +196,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts,
+ ElementsProcessed &num_elements_processed)
{
const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -200,9 +223,10 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
reinterpret_output_as_3d = false;
// dst tensor auto initialization if not yet initialized
const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
- if(output_stage.type != GEMMLowpOutputStageType::NONE)
+ if (output_stage.type != GEMMLowpOutputStageType::NONE)
{
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
}
else
{
@@ -211,7 +235,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -224,11 +248,12 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
num_elems_processed_per_iteration_x = 1;
num_elems_processed_per_iteration_y = 1;
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- if(gemm_info.a_offset != 0)
+ if (gemm_info.a_offset != 0)
{
AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
@@ -236,17 +261,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
// No access window needed for vector_sum_row
ARM_COMPUTE_UNUSED(vector_sum_row);
- if(bias != nullptr)
+ if (bias != nullptr)
{
AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
window_changed = window_changed || update_window_and_padding(win, bias_access);
}
- if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+ if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
{
- AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+ num_elems_processed_per_iteration_x);
AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
- window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
+ window_changed =
+ window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
}
}
@@ -278,7 +305,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
collapsed.set(Window::DimX, x_dimension);
collapsed.set(Window::DimY, y_dimension);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, collapsed);
}
} // namespace
@@ -288,15 +316,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyResha
_type = CLKernelType::GEMM;
}
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
- const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
- ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ ITensorInfo *bias,
+ ITensorInfo *output_multipliers,
+ ITensorInfo *output_shifts)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
- auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row });
+ auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info;
const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info;
const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -313,7 +348,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed);
+ auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -334,18 +370,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil
build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option("-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul");
- if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT");
_fuse_output_stage = true;
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0 && vector_sum_col != nullptr)
+ if (a_offset != 0 && vector_sum_col != nullptr)
{
build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -396,42 +433,54 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- dst->clone().get(),
- gemm_info,
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- bias != nullptr ? bias->clone().get() : nullptr,
- output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
- output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
- num_elements_processed)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+ output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ bias != nullptr ? bias->clone().get() : nullptr,
+ output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+ output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+ num_elements_processed)
+ .first);
return Status{};
}
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
- const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -449,7 +498,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens
add_3d_tensor_nhw_argument(idx, src1);
// Bias buffer (_add_bias == true)
- if(src2 != nullptr)
+ if (src2 != nullptr)
{
add_3d_tensor_nhw_argument(idx, src2);
}
@@ -461,21 +510,20 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens
_kernel.setArg<cl_int>(idx++, _n);
_kernel.setArg<cl_int>(idx++, _k);
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
- if(vector_sum_col != nullptr)
+ if (vector_sum_col != nullptr)
{
add_3d_tensor_nhw_argument(idx, vector_sum_col);
}
- if(vector_sum_row != nullptr)
+ if (vector_sum_row != nullptr)
{
add_3d_tensor_nhw_argument(idx, vector_sum_row);
}
}
enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
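
The hunks above and in the files that follow apply the same mechanical rules. A condensed, hand-written illustration of the style they converge on (inferred from the visible changes only; the clang-format configuration file itself does not appear in this diff):

// Dummy declaration, not real library code, showing the inferred rules:
// - roughly 120-column limit, with long calls and conditions re-wrapped across lines
// - one parameter per line once a declaration no longer fits on a single line
// - a space after control-flow keywords: "if (", "while (", and "} while (...);" kept on one line
// - no inner spaces in braced initializers: {false} rather than { false }
struct ExampleKernel
{
    void configure(const int *src0,
                   const int *src1,
                   int       *dst,
                   int        a_offset,
                   int        b_offset);

    bool _fuse_output_stage{false};
};
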
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
index 0ae549cd53..fc8b73140d 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/IClKernel.h"
@@ -65,29 +66,42 @@ public:
* @param[in] output_multipliers (Optional) Output multipliers tensor. Supported data types: S32.
* @param[in] output_shifts (Optional) Output shifts tensor. Supported data types: S32.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr,
- ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ ITensorInfo *bias = nullptr,
+ ITensorInfo *output_multipliers = nullptr,
+ ITensorInfo *output_shifts = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
- const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr,
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ const GEMMKernelInfo &gemm_info,
+ const ITensorInfo *vector_sum_col = nullptr,
+ const ITensorInfo *vector_sum_row = nullptr,
+ const ITensorInfo *bias = nullptr,
+ const ITensorInfo *output_multipliers = nullptr,
+ const ITensorInfo *output_shifts = nullptr);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _fuse_output_stage{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
+ bool _fuse_output_stage{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
};
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */
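
As a rough usage sketch of the re-wrapped validate() declared above (not part of this patch): the trailing reduction, bias and output-stage tensors keep their nullptr defaults, and the shapes, data types and GEMM sizes below are invented placeholders that the kernel's own argument checks may still reject.

// Hypothetical validate-only call; builds only inside the library source tree, since the header is internal.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"

arm_compute::Status check_gemmlowp_mmul()
{
    using namespace arm_compute;

    // Placeholder M=64, N=32, K=128 low-precision GEMM.
    const TensorInfo src0(TensorShape(128U, 64U), 1, DataType::QASYMM8); // LHS
    const TensorInfo src1(TensorShape(32U, 128U), 1, DataType::QASYMM8); // reshaped RHS (placeholder shape)
    const TensorInfo dst(TensorShape(32U, 64U), 1, DataType::S32);

    GEMMKernelInfo gemm_info{};
    gemm_info.m                 = 64;
    gemm_info.n                 = 32;
    gemm_info.k                 = 128;
    gemm_info.output_stage.type = GEMMLowpOutputStageType::NONE; // no fused output stage

    // vector_sum_col/row, bias, output_multipliers and output_shifts default to nullptr.
    return opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(&src0, &src1, &dst,
                                                                                        gemm_info);
}
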
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
index 9ec0b5182f..d93dbde95a 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
@@ -28,11 +28,10 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -44,12 +43,16 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t a_offset,
+ int32_t b_offset)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -57,26 +60,28 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
+ if (b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
+ if (output_shape.num_dimensions() > 1)
{
const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -87,13 +92,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
- if(a_offset != 0)
+ if (a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
@@ -108,29 +115,34 @@ ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel()
}
void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context,
- const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t k, int32_t a_offset, int32_t b_offset)
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
- auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
+ auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias});
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->num_dimensions() > 1
- && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+ mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -138,8 +150,10 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi
// If b_offset == 0, vector_sum_row can be a nullptr
build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
std::string kernel_name("gemmlowp_offset_contribution");
@@ -165,10 +179,15 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t a_offset, int32_t b_offset)
+Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t a_offset,
+ int32_t b_offset)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
return Status{};
}
@@ -177,10 +196,13 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
- const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
- const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
- const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
- const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
@@ -209,8 +231,7 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind
add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
index 48926e280b..2080a3a091 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
@@ -67,15 +67,25 @@ public:
* @param[in] b_offset Offset to be added to each element of the matrix B.
*/
void configure(const CLCompileContext &compile_context,
- const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- int32_t k, int32_t a_offset, int32_t b_offset);
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpOffsetContributionKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ int32_t a_offset,
+ int32_t b_offset);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
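
A brief sketch of how the nullability rules checked in validate_arguments() above translate to a call of this validate(): with a_offset == 0 the vector_sum_col pointer may legitimately be nullptr, and bias is likewise optional. The wrapper below is hypothetical and not part of this patch.

// Hypothetical helper; the caller supplies S32 tensor infos, as required by the checks above.
#include <cstdint>

#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"

arm_compute::Status check_offset_contribution(const arm_compute::TensorInfo &mm_result,
                                              const arm_compute::TensorInfo &vector_sum_row,
                                              int32_t                        b_offset)
{
    using namespace arm_compute;
    // a_offset == 0, so vector_sum_col can be nullptr; bias is also optional.
    return opencl::kernels::ClGemmLowpOffsetContributionKernel::validate(&mm_result, /*vector_sum_col=*/nullptr,
                                                                         &vector_sum_row, /*bias=*/nullptr,
                                                                         /*a_offset=*/0, b_offset);
}
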
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
index c5fb54f524..26f479f61a 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -34,7 +34,6 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -46,12 +45,20 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst,
- int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -62,33 +69,35 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
- if(output_stage.is_quantized_per_channel)
+ if (output_stage.is_quantized_per_channel)
{
ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
}
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
}
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
+ if (b_offset != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d =
+ mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Validate input
- ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+ ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+ (mm_result->dimension(1) * mm_result->dimension(2)));
ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
TensorShape output_shape = mm_result->tensor_shape();
- if(output_shape.num_dimensions() > 1)
+ if (output_shape.num_dimensions() > 1)
{
const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
@@ -99,20 +108,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
"mm_result tensor must have the same number of batches of output tensor");
- if(a_offset != 0)
+ if (a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+ vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of "
+ "vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
// Checks performed when output is configured
- if((dst != nullptr) && (dst->total_size() != 0))
+ if ((dst != nullptr) && (dst->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
@@ -120,7 +131,8 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
}
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(),
+ "per channel quantization info is incorrect");
return Status{};
}
@@ -131,16 +143,26 @@ ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutpu
_type = CLKernelType::ELEMENTWISE;
}
-void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context,
- const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+ b_offset, output_stage, output_multipliers, output_shifts));
- auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts });
+ auto padding_info =
+ get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts});
const int min = output_stage.gemmlowp_min_bound;
const int max = output_stage.gemmlowp_max_bound;
@@ -148,9 +170,8 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon
_is_quantized_per_channel = output_stage.is_quantized_per_channel;
// Check if input is a 3D reinterpretation
- const bool reinterpret_as_3d = vector_sum_row != nullptr
- && mm_result->num_dimensions() > 1
- && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+ const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+ mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
// Auto initialize the output
auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type));
@@ -160,10 +181,11 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
// If a_offset == 0, vector_sum_col can be a nullptr
- if(a_offset != 0)
+ if (a_offset != 0)
{
build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -171,8 +193,10 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon
// If b_offset == 0, vector_sum_row can be a nullptr
build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
- build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
- build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+ build_opts.add_option_if(reinterpret_as_3d,
+ "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
@@ -210,26 +234,42 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
- const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+ b_offset, output_stage, output_multipliers, output_shifts));
return Status{};
}
-void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
- const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
- const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
- const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
- const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto mm_result =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ const auto vector_sum_col =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+ const auto vector_sum_row =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+ const auto output_shifts =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+ const auto output_multipliers =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
@@ -260,8 +300,7 @@ void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors,
add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
index cee04473c4..97ee9bc97f 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
@@ -66,23 +66,40 @@ public:
* @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
* Supported data types: S32
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
- int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
- const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ int32_t k,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
- int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+ static Status validate(const ITensorInfo *mm_result,
+ const ITensorInfo *vector_sum_col,
+ const ITensorInfo *vector_sum_row,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ int32_t a_offset,
+ int32_t b_offset,
+ const GEMMLowpOutputStageInfo &output_stage,
+ const ITensorInfo *output_multipliers,
+ const ITensorInfo *output_shifts);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _is_quantized_per_channel{ false };
+ bool _is_quantized_per_channel{false};
};
} // namespace kernels
} // namespace opencl
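
For the per-channel path checked in this kernel's validate_arguments() (is_quantized_per_channel), output_multipliers and output_shifts are 1-D S32 tensors whose first dimension matches mm_result's. The sketch below is hypothetical: the sizes, multipliers and shifts are invented placeholders, and the include of the core descriptors header is assumed to pull in the GEMMLowpOutputStageInfo definition.

// Hypothetical per-channel example; N (output width) and all quantization numbers are invented.
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"

arm_compute::Status check_offset_contribution_output_stage()
{
    using namespace arm_compute;

    const unsigned int n = 32, m = 64;
    const TensorInfo mm_result(TensorShape(n, m), 1, DataType::S32);
    const TensorInfo dst(TensorShape(n, m), 1, DataType::QASYMM8_SIGNED);
    const TensorInfo output_multipliers(TensorShape(n), 1, DataType::S32); // one multiplier per output channel
    const TensorInfo output_shifts(TensorShape(n), 1, DataType::S32);      // one shift per output channel

    GEMMLowpOutputStageInfo output_stage{};
    output_stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    output_stage.output_data_type         = DataType::QASYMM8_SIGNED;
    output_stage.is_quantized_per_channel = true;
    output_stage.gemmlowp_multipliers.resize(n, 1073741824); // placeholder fixed-point multipliers
    output_stage.gemmlowp_shifts.resize(n, 1);               // placeholder shifts (sizes must match)
    output_stage.gemmlowp_min_bound = -128;
    output_stage.gemmlowp_max_bound = 127;

    // Both offsets are zero here, so the two reduction vectors and the bias stay nullptr.
    return opencl::kernels::ClGemmLowpOffsetContributionOutputStageKernel::validate(
        &mm_result, nullptr, nullptr, nullptr, &dst, /*a_offset=*/0, /*b_offset=*/0, output_stage,
        &output_multipliers, &output_shifts);
}
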
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
index 39754385a1..7b7beab12c 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -27,15 +27,14 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -47,20 +46,23 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -75,7 +77,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32S
_type = CLKernelType::ELEMENTWISE;
}
-Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
const GEMMLowpOutputStageInfo *info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -84,14 +88,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITenso
return Status{};
}
-void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
const GEMMLowpOutputStageInfo *info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
- auto padding_info = get_padding_info({ src, bias, dst });
+ auto padding_info = get_padding_info({src, bias, dst});
// dst auto inizialitation if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
@@ -103,19 +110,26 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi
auto max = info->gemmlowp_max_bound;
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
- "-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
- "-DMAX_BOUND=" + support::cpp11::to_string(max));
+ build_opts.add_option_if(
+ (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+ (min != max),
+ "-DMIN_BOUND=" + support::cpp11::to_string(min));
+ build_opts.add_option_if(
+ (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+ (min != max),
+ "-DMAX_BOUND=" + support::cpp11::to_string(max));
build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
// Create kernel
- const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
+ const std::string kernel_name = (info->output_data_type == DataType::QSYMM16)
+ ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16"
+ : "gemmlowp_output_stage_quantize_down_fixedpoint";
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -129,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
// Create src window
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -144,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten
// Setup bias slice
unsigned int idx1 = num_arguments_per_3D_tensor();
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window biases_slice(slice);
biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -158,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx1, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
index 69b5fc5018..71c9f4b752 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
@@ -60,14 +60,21 @@ public:
* @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
* @param[in] info Output stage info. Used to pass the quantized output data type
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
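
A minimal sketch of a per-tensor call to the validate() declared above, with bias left out since the checks treat it as optional. The quantization numbers are invented, and the extra include is an assumption made so that GEMMLowpOutputStageInfo is visible.

#include "arm_compute/core/KernelDescriptors.h" // assumed to provide GEMMLowpOutputStageInfo
#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"

arm_compute::Status check_quantize_down_fixedpoint(const arm_compute::TensorInfo &src_s32,
                                                   const arm_compute::TensorInfo &dst_qasymm8)
{
    using namespace arm_compute;

    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.output_data_type    = DataType::QASYMM8;
    info.gemmlowp_offset     = 2;          // becomes -DRESULT_OFFSET_AFTER_SHIFT
    info.gemmlowp_multiplier = 1073741824; // becomes -DRESULT_FIXEDPOINT_MULTIPLIER
    info.gemmlowp_shift      = 1;          // becomes -DRESULT_SHIFT
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;

    return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(&src_s32, /*bias=*/nullptr,
                                                                                         &dst_qasymm8, &info);
}
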
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index f379698326..52ebd32d46 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -27,15 +27,14 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -47,23 +46,31 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
- || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) &&
+ (info->output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ info->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ info->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) ||
+ info->gemmlowp_min_bound > info->gemmlowp_max_bound);
// Check biases if exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -78,7 +85,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleB
_type = CLKernelType::ELEMENTWISE;
}
-Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
const GEMMLowpOutputStageInfo *info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -87,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo
return Status{};
}
-void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
const GEMMLowpOutputStageInfo *info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
- auto padding_info = get_padding_info({ src, bias, dst });
+ auto padding_info = get_padding_info({src, bias, dst});
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
@@ -107,7 +119,8 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
@@ -130,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
// Create input window
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -145,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors,
// Setup bias slice
unsigned int idx1 = num_arguments_per_3D_tensor();
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window biases_slice(slice);
biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -159,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors,
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx1, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
index 8eda24d25f..057c66767f 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -62,14 +62,21 @@ public:
* @param[out] dst Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] info Output stage info. Used to pass the quantized output data type
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
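
The -DREAL_MULTIPLIER and -DOUTPUT_OFFSET options set in this kernel's configure() suggest the per-element requantization it performs: scale the int32 accumulator by a real multiplier, add the output offset, then clamp to the quantized range. A rough scalar model under those assumptions; it is a sketch, not the actual .cl kernel:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical scalar model of the "quantize down int32 by float" output stage.
    // real_multiplier/output_offset mirror -DREAL_MULTIPLIER/-DOUTPUT_OFFSET;
    // min_bound/max_bound mirror the optional -DMIN_BOUND/-DMAX_BOUND options.
    inline uint8_t quantize_down_by_float(int32_t acc,
                                          int32_t bias, // added only when -DADD_BIAS is defined
                                          float   real_multiplier,
                                          int32_t output_offset,
                                          int32_t min_bound,
                                          int32_t max_bound)
    {
        const float   scaled  = static_cast<float>(acc + bias) * real_multiplier;
        const int32_t rounded = static_cast<int32_t>(std::lround(scaled)) + output_offset;
        const int32_t clamped = std::min(std::max(rounded, min_bound), max_bound);
        return static_cast<uint8_t>(clamped);
    }
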
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
index 5d54db214a..31434ce61b 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -26,15 +26,14 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -46,25 +45,34 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
- || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+ ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) &&
+ (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_max_bound >
+ std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ output_stage->gemmlowp_min_bound <
+ std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+ output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
    // Check biases if they exist
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type,
+ "Mismatching output data type");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
}
@@ -77,7 +85,10 @@ ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel()
_type = CLKernelType::ELEMENTWISE;
}
-Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
@@ -85,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src,
return Status{};
}
-void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
const GEMMLowpOutputStageInfo *output_stage)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
- auto padding_info = get_padding_info({ src, bias, dst });
+ auto padding_info = get_padding_info({src, bias, dst});
    // Output auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
@@ -104,13 +118,18 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c
auto max = output_stage->gemmlowp_max_bound;
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
- build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
+ build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(
+ output_stage->output_data_type))) &&
+ (min != max),
"-DMIN_BOUND=" + support::cpp11::to_string(min));
- build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
+ build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(
+ output_stage->output_data_type))) &&
+ (min != max),
"-DMAX_BOUND=" + support::cpp11::to_string(max));
build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
@@ -135,15 +154,17 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ const auto bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = collapsed.first_slice_window_3D();
unsigned int idx1 = num_arguments_per_3D_tensor();
- if(bias != nullptr)
+ if (bias != nullptr)
{
Window biases_slice(slice);
biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -157,8 +178,7 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx1, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
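
Similarly, the -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT options built above hint at the integer variant of the same output stage: offset, multiply and shift the accumulator before clamping. A hedged scalar sketch assuming the conventional gemmlowp-style formula, again not a quote of the OpenCL source:

    #include <algorithm>
    #include <cstdint>

    // Assumed scalar model of the integer "quantize down int32 scale" stage.
    // result_offset/result_mult_int/result_shift mirror the -DRESULT_OFFSET,
    // -DRESULT_MULT_INT and -DRESULT_SHIFT build options.
    inline uint8_t quantize_down_by_int(int32_t acc,
                                        int32_t bias, // added only when -DADD_BIAS is defined
                                        int32_t result_offset,
                                        int32_t result_mult_int,
                                        int32_t result_shift,
                                        int32_t min_bound,
                                        int32_t max_bound)
    {
        int32_t value = (acc + bias + result_offset) * result_mult_int;
        value >>= result_shift; // plain arithmetic shift here; real kernels typically round-to-nearest
        return static_cast<uint8_t>(std::min(std::max(value, min_bound), max_bound));
    }
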
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
index 84c5060362..e6390801f1 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -62,14 +62,21 @@ public:
 * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] output_stage GEMMLowp output stage metadata.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -77,4 +84,4 @@ public:
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
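
For orientation, the validate() declared above can be exercised against bare ITensorInfo descriptors before any OpenCL resources exist. A usage sketch with made-up shapes and output-stage values, assuming an in-tree translation unit with access to the internal src/ headers and the GEMMLowpOutputStageInfo fields referenced in the .cpp above:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"

    #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"

    using namespace arm_compute;

    bool can_quantize_down(unsigned int n, unsigned int m)
    {
        // S32 accumulators of shape (n, m), a per-column S32 bias and a QASYMM8 destination.
        const TensorInfo src(TensorShape(n, m), 1, DataType::S32);
        const TensorInfo bias(TensorShape(n), 1, DataType::S32);
        const TensorInfo dst(TensorShape(n, m), 1, DataType::QASYMM8);

        GEMMLowpOutputStageInfo info{}; // illustrative values only
        info.gemmlowp_offset     = 2;
        info.gemmlowp_multiplier = 3;
        info.gemmlowp_shift      = 4;
        info.gemmlowp_min_bound  = 0;
        info.gemmlowp_max_bound  = 255;
        info.output_data_type    = DataType::QASYMM8;

        const Status status =
            opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(&src, &bias, &dst, &info);
        return status.error_code() == ErrorCode::OK;
    }
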
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
index ea88b485a0..ee4a191fed 100644
--- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
@@ -32,7 +32,6 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -47,12 +46,15 @@ namespace
Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(1),
+ "Output vector must have length equal to the number of rows of the input matrix");
}
return Status{};
}
@@ -60,12 +62,15 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITens
Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->dimension(0) != src->dimension(0),
+ "Output vector must have length equal to the number of columns of the input matrix");
}
return Status{};
}
@@ -76,7 +81,10 @@ IClGemmLowpReductionKernel::IClGemmLowpReductionKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_a,
+ ITensorInfo *vector_sum_row,
+ const GEMMLowpReductionKernelInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
@@ -85,7 +93,7 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile
// Output auto initialization if not yet initialized
auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32);
- auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
+ auto padding_info = get_padding_info({mtx_a, vector_sum_row});
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
@@ -120,7 +128,9 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a,
+ const ITensorInfo *vector_sum_row,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
@@ -133,8 +143,9 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
Window slice_in = collapsed.first_slice_window_2D();
@@ -151,11 +162,13 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window
add_3D_tensor_argument(idx, src, slice_in);
add_2D_tensor_argument(idx, dst, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_2D(slice_out));
+ } while (collapsed.slide_window_slice_2D(slice_out));
}
-void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_b,
+ ITensorInfo *vector_sum_col,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
@@ -163,14 +176,15 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile
// Output auto initialization if not yet initialized
auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32);
- auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
+ auto padding_info = get_padding_info({mtx_b, vector_sum_col});
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0));
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0)));
build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type()));
@@ -192,7 +206,9 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b,
+ const ITensorInfo *vector_sum_col,
+ const GEMMLowpReductionKernelInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
@@ -205,8 +221,9 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -222,8 +239,7 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window
add_3D_tensor_argument(idx, src, slice_in);
add_2D_tensor_argument(idx, dst, slice_out);
enqueue(queue, *this, slice_out, lws_hint());
- }
- while(collapsed.slide_window_slice_2D(slice_out));
+ } while (collapsed.slide_window_slice_2D(slice_out));
}
} // namespace kernels
} // namespace opencl
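
The -DVEC_SIZE and -DVEC_SIZE_LEFTOVER options assembled above come from adjust_vec_size(16, dim) and dim % vec_size; the leftover is what the kernel uses for the final, partially filled vector. A small standalone sketch of that bookkeeping in plain C++, mirroring the intent of the library helper rather than reusing it:

    #include <algorithm>
    #include <cstdio>

    // Approximates the intent of adjust_vec_size(16, dim): never use a vector
    // width larger than the dimension itself.
    unsigned int adjusted_vec_size(unsigned int preferred, unsigned int dim)
    {
        return std::min(preferred, dim);
    }

    int main()
    {
        const unsigned int cols_b   = 37; // hypothetical mtx_b->dimension(0)
        const unsigned int vec_size = adjusted_vec_size(16, cols_b);
        const unsigned int leftover = cols_b % vec_size; // value passed as -DVEC_SIZE_LEFTOVER

        std::printf("-DVEC_SIZE=%u -DVEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
        return 0;
    }
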
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
index 7119b5fee0..c81543e4c2 100644
--- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -52,7 +53,10 @@ public:
* - scalar Scalar value to multiply each reduced column/row by.
* - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
*/
- virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0;
+ virtual void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const GEMMLowpReductionKernelInfo &info) = 0;
};
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
@@ -74,14 +78,18 @@ public:
* - scalar Scalar value to multiply each reduced column/row by.
* - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_a,
+ ITensorInfo *vector_sum_row,
+ const GEMMLowpReductionKernelInfo &info) override;
/** Static function to check if given info will lead to a valid configuration
*
 * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -106,14 +114,18 @@ public:
* - scalar Scalar value to multiply each reduced column/row by.
* - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *mtx_b,
+ ITensorInfo *vector_sum_col,
+ const GEMMLowpReductionKernelInfo &info) override;
/** Static function to check if given info will lead to a valid configuration
*
 * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
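
As the class comments above describe, the A-reduction produces one sum per row of matrix A and the B-reduction one sum per column of matrix B, matching the dimension checks in the .cpp. A plain reference sketch of those reductions, including the optional scalar multiply mentioned in the doc comments:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Row sums of an M x K quantized matrix stored row-major: output length M.
    std::vector<int32_t>
    reduce_rows(const std::vector<uint8_t> &a, std::size_t m, std::size_t k, int32_t scalar = 1)
    {
        std::vector<int32_t> sums(m, 0);
        for (std::size_t row = 0; row < m; ++row)
        {
            for (std::size_t col = 0; col < k; ++col)
            {
                sums[row] += a[row * k + col];
            }
            sums[row] *= scalar; // the "multiply by a scalar" option described above
        }
        return sums;
    }

    // Column sums of a K x N quantized matrix stored row-major: output length N.
    std::vector<int32_t>
    reduce_cols(const std::vector<uint8_t> &b, std::size_t k, std::size_t n, int32_t scalar = 1)
    {
        std::vector<int32_t> sums(n, 0);
        for (std::size_t row = 0; row < k; ++row)
        {
            for (std::size_t col = 0; col < n; ++col)
            {
                sums[col] += b[row * n + col];
            }
        }
        for (auto &s : sums)
        {
            s *= scalar;
        }
        return sums;
    }
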
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index b8997dfc7f..fd23aa9924 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -29,10 +29,11 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -51,7 +52,13 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
{
@@ -59,15 +66,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+ (!gemm_info.broadcast_bias),
+ "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
@@ -82,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n);
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k);
- if(gemm_info.reinterpret_input_as_3d)
+ if (gemm_info.reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
}
@@ -91,15 +103,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
}
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
{
const unsigned int src2_dim0 = src2->dimension(0);
const unsigned int src2_dim1 = src2->dimension(1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(gemm_info.broadcast_bias)
+ if (gemm_info.broadcast_bias)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
}
else
{
@@ -107,9 +120,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
}
@@ -117,9 +131,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
+ const GEMMKernelInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
{
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
@@ -132,17 +151,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+ if (reinterpret_input_as_3d == reinterpret_output_as_3d)
{
reinterpret_output_as_3d = false;
}
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -155,34 +175,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
num_elems_processed_per_iteration_x = rhs_info.n0;
num_elems_processed_per_iteration_y = lhs_info.m0;
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win_out =
+ calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowStatic src0_access(src0, 0, 0,
- src0->dimension(0),
- src0->dimension(1));
- AccessWindowStatic src1_access(src1, 0, 0,
- ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
- src1->dimension(1));
- AccessWindowStatic dst_access(dst, 0, 0,
- dst->dimension(0),
- dst->dimension(1));
+ AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1));
+ AccessWindowStatic src1_access(
+ src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
+ AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
- if(src2 != nullptr)
+ if (src2 != nullptr)
{
const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
- AccessWindowStatic src2_access(src2, 0, 0,
- ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
+ AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
src2->dimension(1));
- window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
+ window_changed = update_window_and_padding(win, src0_access, src1_access,
+ src2_access) || // window used by the execute_window_loop
+ update_window_and_padding(
+ win_out, dst_access); // window used to update the padding requirements of dst tensor
}
else
{
- window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
+ window_changed =
+ update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
+ update_window_and_padding(win_out,
+ dst_access); // window used to update the padding requirements of dst tensor
}
// Collapse along the Z direction
@@ -191,7 +211,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
collapsed = win.collapse(win, dimension_to_collapse);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, collapsed);
}
} // namespace
@@ -201,19 +222,26 @@ ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel()
_type = CLKernelType::GEMM;
}
-void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
+void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
float beta,
const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- auto padding_info = get_padding_info({ src0, dst });
+ auto padding_info = get_padding_info({src0, dst});
_reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d;
_reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
@@ -221,7 +249,7 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+ if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
{
_reinterpret_input_as_3d = false;
_reinterpret_output_as_3d = false;
@@ -234,7 +262,8 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+ auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info,
+ rhs_info, gemm_info, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
IClKernel::configure_internal(win_config.second);
@@ -260,14 +289,17 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
@@ -275,9 +307,13 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_native");
@@ -314,21 +350,23 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
src2 != nullptr ? src2->clone().get() : nullptr,
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
+ dst->clone().get(), lhs_info, rhs_info, gemm_info,
num_elements_processed)
- .first);
+ .first);
return Status{};
}
@@ -338,15 +376,18 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -358,11 +399,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
- if(_reinterpret_input_as_3d)
+ if (_reinterpret_input_as_3d)
{
// Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
unsigned int idx0;
- if(_add_bias)
+ if (_add_bias)
{
idx0 = 4 * num_arguments_per_2D_tensor() + 7;
}
@@ -374,11 +415,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
}
- if(_reinterpret_output_as_3d)
+ if (_reinterpret_output_as_3d)
{
// Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
unsigned int idx0;
- if(_add_bias)
+ if (_add_bias)
{
idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
}
@@ -395,7 +436,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
+ if (!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -403,7 +444,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx = 0;
add_2D_tensor_argument(idx, src0, slice);
add_2D_tensor_argument(idx, src1, slice_b);
- if(_add_bias)
+ if (_add_bias)
{
add_2D_tensor_argument(idx, src2, slice);
}
@@ -411,7 +452,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
- if(_add_bias)
+ if (_add_bias)
{
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
}
@@ -423,8 +464,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_int>(idx++, _k);
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
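
The build options assembled in this kernel's configure() (ALPHA, BETA, BROADCAST_BIAS and the optional ACTIVATION_TYPE/A_VAL/B_VAL) outline the computation it implements: dst = alpha * src0 * src1 + beta * src2, with src2 optionally broadcast as a single row of length N as validated above. A scalar reference sketch of that expression, assuming float data and ignoring the 3D-reinterpretation paths and the activation:

    #include <cstddef>
    #include <vector>

    // dst(M x N) = alpha * src0(M x K) * src1(K x N) + beta * bias, all row-major.
    // When broadcast_bias is true, bias is a single row of length N; otherwise it is M x N.
    std::vector<float> gemm_reference(const std::vector<float> &src0,
                                      const std::vector<float> &src1,
                                      const std::vector<float> &bias,
                                      std::size_t m, std::size_t n, std::size_t k,
                                      float alpha, float beta, bool broadcast_bias)
    {
        std::vector<float> dst(m * n, 0.0f);
        for (std::size_t i = 0; i < m; ++i)
        {
            for (std::size_t j = 0; j < n; ++j)
            {
                float acc = 0.0f;
                for (std::size_t p = 0; p < k; ++p)
                {
                    acc += src0[i * k + p] * src1[p * n + j];
                }
                const float b  = bias.empty() ? 0.0f : (broadcast_bias ? bias[j] : bias[i * n + j]);
                dst[i * n + j] = alpha * acc + beta * b;
            }
        }
        return dst;
    }
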
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index 80f8355932..da6c9a5bb7 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -25,6 +25,7 @@
#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -58,7 +59,13 @@ public:
 * rhs_info.k0: same as lhs_info.k0
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info);
@@ -68,7 +75,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info);
@@ -76,14 +89,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _add_bias{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
};
} // namespace kernels
} // namespace opencl
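
The lhs_info.m0 / rhs_info.n0 / rhs_info.k0 parameters documented above (exported as -DM0/-DN0/-DK0) describe the block sizes processed per work item. A simplified blocked-loop sketch of how such tiling partitions the same reference GEMM; the edge handling here is only illustrative and is not the kernel's actual work-item mapping:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Blocked GEMM c += a(M x K) * b(K x N), row-major, with M0 x N0 output tiles
    // and K0-wide steps along the accumulation dimension.
    void blocked_gemm(const std::vector<float> &a, const std::vector<float> &b, std::vector<float> &c,
                      std::size_t m, std::size_t n, std::size_t k,
                      std::size_t m0, std::size_t n0, std::size_t k0)
    {
        for (std::size_t bi = 0; bi < m; bi += m0)
        {
            for (std::size_t bj = 0; bj < n; bj += n0)
            {
                for (std::size_t bp = 0; bp < k; bp += k0)
                {
                    // One (m0 x n0) tile accumulating a k0-wide slice; edge tiles are clipped,
                    // roughly what the PARTIAL_STORE_M0/N0 options account for in the real kernels.
                    const std::size_t i_end = std::min(bi + m0, m);
                    const std::size_t j_end = std::min(bj + n0, n);
                    const std::size_t p_end = std::min(bp + k0, k);
                    for (std::size_t i = bi; i < i_end; ++i)
                    {
                        for (std::size_t j = bj; j < j_end; ++j)
                        {
                            float acc = 0.0f;
                            for (std::size_t p = bp; p < p_end; ++p)
                            {
                                acc += a[i * k + p] * b[p * n + j];
                            }
                            c[i * n + j] += acc;
                        }
                    }
                }
            }
        }
    }
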
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index d72d29ea1e..4fe6bddb36 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -29,10 +29,11 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -52,7 +53,13 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
{
@@ -61,42 +68,50 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3),
+ "Only 2,3,4,8,16 are supported for m0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+ (!gemm_info.broadcast_bias),
+ "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32),
+ "Mixed precision only supported for F16 data type");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
const unsigned int m = gemm_info.m;
const unsigned int n = gemm_info.n;
const unsigned int k = gemm_info.k;
- TensorShape tensor_shape0{ src0->tensor_shape() };
+ TensorShape tensor_shape0{src0->tensor_shape()};
tensor_shape0.set(0, k);
tensor_shape0.set(1, m);
- TensorShape tensor_shape1{ src1->tensor_shape() };
+ TensorShape tensor_shape1{src1->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
{
const unsigned int src2_dim0 = src2->dimension(0);
const unsigned int src2_dim1 = src2->dimension(1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(gemm_info.broadcast_bias)
+ if (gemm_info.broadcast_bias)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
}
else
{
@@ -107,15 +122,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+ const TensorInfo tensor_info_reshaped0 =
+ src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
}
@@ -123,9 +141,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
+ const GEMMKernelInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
{
ARM_COMPUTE_UNUSED(src0, src1, src2);
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
@@ -134,7 +157,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -147,7 +170,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
num_elems_processed_per_iteration_x = rhs_info.n0;
num_elems_processed_per_iteration_y = lhs_info.m0;
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// Collapse along the Z direction
// This collapse needs to be here in order to tune the Z dimension of LWS
@@ -164,18 +188,26 @@ ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel()
_type = CLKernelType::GEMM;
}
-void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
- const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- auto padding_info = get_padding_info({ src0, src1, src2, dst });
+ auto padding_info = get_padding_info({src0, src1, src2, dst});
_reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
_add_bias = src2 != nullptr;
@@ -188,14 +220,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
ElementsProcessed num_elements_processed{};
// Configure kernel window
- auto win_config = validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
- (src2 != nullptr) ? src2->clone().get() : nullptr,
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info,
- num_elements_processed);
+ auto win_config = validate_and_configure_window(
+ src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+ lhs_info, rhs_info, gemm_info, num_elements_processed);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure_internal(win_config.second);
@@ -213,12 +240,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
// Create build options
CLBuildOptions build_opts;
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+ build_opts.add_option_if(_reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
@@ -229,7 +259,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
- build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
+ build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision
+ ? get_cl_type_from_data_type(DataType::F32)
+ : get_cl_type_from_data_type(data_type)));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -237,9 +269,13 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_reshaped_");
kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
@@ -287,9 +323,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
return Status{};
@@ -300,15 +342,18 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -324,12 +369,14 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
cl::Image2D src1_image2d;
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
- const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
+ const TensorShape shape2d(src1->info()->dimension(0) / 4,
+ src1->info()->dimension(1) * src1->info()->dimension(2));
const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+ src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
do
@@ -337,7 +384,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
+ if (!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -348,7 +395,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
add_2D_tensor_argument(idx, src0, slice);
// RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
_kernel.setArg(idx++, src1_image2d);
}
@@ -370,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
// Bias stride_z (if _add_bias == true)
- if(_add_bias)
+ if (_add_bias)
{
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
}
@@ -379,7 +426,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
// Cross-plan padding (if _reinterpret_output_as_3d = true)
- if(_reinterpret_output_as_3d)
+ if (_reinterpret_output_as_3d)
{
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
}
@@ -393,8 +440,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
// Dispatch kernel
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
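The hunks above reduce to a few mechanical rules from the revised configuration: parameter lists that overflow the column limit break to one argument per line with aligned names, `if(`/`while(` gain a space before the parenthesis, braced initializers lose their inner padding (`{ false }` becomes `{false}`), and do-while loops now close on a single `} while (...);` line. A minimal sketch of the same conventions follows; `Info`, `Example`, `configure` and `run` are illustrative names only, not Compute Library symbols.

    // Illustrative only: hypothetical names, layout mirroring the applied style.
    struct Info
    {
    };

    class Example
    {
    public:
        void configure(const Info *src0,
                       const Info *src1,
                       Info       *dst,
                       float       alpha); // one argument per line, pointers and names column-aligned

    private:
        bool _flag{false}; // braced init without inner spaces: { false } -> {false}
        int  _m{1};
    };

    void run(int n)
    {
        if (n < 3) // space between the keyword and '('
        {
            n = 3;
        }
        do
        {
            --n;
        } while (n > 0); // 'while' joins the closing brace of the do-block
    }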
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 8d25412a40..30928c4e1d 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -24,12 +24,12 @@
#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
-#include "arm_compute/core/KernelDescriptors.h"
-
namespace arm_compute
{
namespace opencl
@@ -83,16 +83,29 @@ public:
*
* @note lhs_info.k0 must be equal to rhs_info.k0
*/
- void configure(const ClCompileContext &compile_context,
- const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info);
@@ -100,14 +113,14 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
+ bool _slide_matrix_b{true};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _add_bias{false};
+ bool _export_to_cl_image{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index b34c17cda8..1b19f1ec5b 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,24 +47,36 @@ namespace
{
using ElementsProcessed = Steps;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
- && (!gemm_info.broadcast_bias),
- "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+ (!gemm_info.broadcast_bias),
+ "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
@@ -71,19 +84,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const unsigned int n = gemm_info.n;
const unsigned int k = gemm_info.k;
- TensorShape tensor_shape1{ src1->tensor_shape() };
+ TensorShape tensor_shape1{src1->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
{
const unsigned int src2_dim0 = src2->dimension(0);
const unsigned int src2_dim1 = src2->dimension(1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0);
- if(gemm_info.broadcast_bias)
+ if (gemm_info.broadcast_bias)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
}
else
{
@@ -93,10 +107,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
- if(gemm_info.reinterpret_input_as_3d)
+ if (gemm_info.reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
}
@@ -106,9 +121,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
}
@@ -116,8 +132,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
+Window validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info,
+ ElementsProcessed &num_elements_processed)
{
ARM_COMPUTE_UNUSED(src0, src1, src2);
unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
@@ -128,14 +150,14 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
// This approach should only be used when the input/dst tensors have pad on the y direction
- if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
+ if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
{
reinterpret_output_as_3d = false;
}
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -148,7 +170,8 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens
num_elems_processed_per_iteration_x = rhs_info.n0;
num_elems_processed_per_iteration_y = lhs_info.m0;
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// Collapse along the Z direction
// This collapse needs to be here in order to tune the Z dimension of LWS
@@ -164,14 +187,22 @@ ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKe
_type = CLKernelType::GEMM;
}
-void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
- const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
@@ -182,11 +213,11 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
_export_to_cl_image = rhs_info.export_to_cl_image;
_has_pad_y = gemm_info.has_pad_y;
- auto padding_info = get_padding_info({ src0, src1, src2, dst });
+ auto padding_info = get_padding_info({src0, src1, src2, dst});
// In case both input and dst have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
+ if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
{
_reinterpret_input_as_3d = false;
_reinterpret_output_as_3d = false;
@@ -199,8 +230,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
ElementsProcessed num_elements_processed{};
// Configure kernel window
- Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), lhs_info, rhs_info, gemm_info,
- num_elements_processed);
+ Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+ (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+ lhs_info, rhs_info, gemm_info, num_elements_processed);
ICLKernel::configure_internal(win);
// If _reinterpret_input_as_3d = reinterpret_output_as_3d = true,
@@ -225,7 +257,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
@@ -240,17 +273,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- if(_has_pad_y)
+ if (_has_pad_y)
{
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+ build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+ "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
}
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
- build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(),
+ "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
std::string kernel_name("gemm_mm_reshaped_only_rhs_");
kernel_name += rhs_info.transpose ? "t" : "nt";
@@ -294,28 +333,39 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
return Status{};
}
-void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -341,12 +391,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
cl::Image2D src1_image2d;
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
- const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
+ const TensorShape shape2d(src1->info()->dimension(0) / 4,
+ src1->info()->dimension(1) * src1->info()->dimension(2));
const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+ src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
do
@@ -354,7 +406,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
+ if (!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -365,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
add_2D_tensor_argument(idx, src0, slice);
// RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
_kernel.setArg(idx++, src1_image2d);
}
@@ -387,22 +439,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
// Bias stride_z (if _add_bias == true)
- if(_add_bias)
+ if (_add_bias)
{
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
+ _kernel.setArg<cl_uint>(idx++,
+ static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
}
// dst stride_z
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
// Cross-plan padding (if _reinterpret_input_as_3d = true)
- if(_reinterpret_input_as_3d && _has_pad_y)
+ if (_reinterpret_input_as_3d && _has_pad_y)
{
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
}
// Cross-plan padding (if reinterpret_output_as_3d = true)
- if(_reinterpret_output_as_3d && _has_pad_y)
+ if (_reinterpret_output_as_3d && _has_pad_y)
{
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
}
@@ -413,8 +466,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
_kernel.setArg<cl_int>(idx++, _k);
enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index 471160c94b..e8fd78d476 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -24,12 +24,12 @@
#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
-#include "arm_compute/core/KernelDescriptors.h"
-
namespace arm_compute
{
namespace opencl
@@ -74,32 +74,46 @@ public:
* rhs_info.transpose: true,false
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*/
- void configure(const ClCompileContext &compile_context,
- const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _slide_matrix_b{ true };
- bool _reinterpret_input_as_3d{ false };
- bool _reinterpret_output_as_3d{ false };
- bool _use_dummy_work_items{ false };
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- bool _has_pad_y{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
+ bool _slide_matrix_b{true};
+ bool _reinterpret_input_as_3d{false};
+ bool _reinterpret_output_as_3d{false};
+ bool _use_dummy_work_items{false};
+ bool _add_bias{false};
+ bool _export_to_cl_image{false};
+ bool _has_pad_y{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
};
} // namespace kernels
} // namespace opencl
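The include changes repeat the same pattern in every file touched by this patch: public `arm_compute/` headers are grouped first and appear to be sorted case-insensitively, then a blank line, then the internal `src/` headers. A sketch of the resulting order, reusing the paths from the hunks above; the exact grouping rule comes from the clang-format configuration, which is not part of this delivery.

    // Include order after reformatting (paths taken from the reshaped-GEMM kernel headers above).
    #include "arm_compute/core/KernelDescriptors.h" // public API headers first

    #include "src/core/common/Macros.h"             // internal headers after a separating blank line
    #include "src/gpu/cl/ClCompileContext.h"
    #include "src/gpu/cl/IClKernel.h"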
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index 734f8f9b4c..9a2a4890f3 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -23,16 +23,17 @@
*/
#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLUtils.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -56,23 +57,36 @@ constexpr int mmul_m0 = 4;
constexpr int mmul_n0 = 4;
constexpr int mmul_k0 = 4;
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+Status validate_arguments(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+ "The number of dimensions for the LHS matrix must be <= 4");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+ "The number of dimensions for the RHS matrix must be <= 3");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && rhs_info.n0 != 8 && rhs_info.n0 != 16, "Only 1,2,3,4,8, and 16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 &&
+ rhs_info.n0 != 8 && rhs_info.n0 != 16,
+ "Only 1,2,3,4,8, and 16 are supported for n0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, "Only true is supported for interleave with mmul extension enabled");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, "Only false is supported for transpose with mmul extension enabled");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true,
+ "Only true is supported for interleave with mmul extension enabled");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false,
+ "Only false is supported for transpose with mmul extension enabled");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
@@ -87,7 +101,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
// Validate the reinterpreted-as-3D-case
- if(gemm_info.depth_output_gemm3d != 0)
+ if (gemm_info.depth_output_gemm3d != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
}
@@ -97,9 +111,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
// Validate the gemm-batched case
- if(src1->num_dimensions() > 2)
+ if (src1->num_dimensions() > 2)
{
- if(gemm_info.depth_output_gemm3d != 0)
+ if (gemm_info.depth_output_gemm3d != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2));
}
@@ -109,15 +123,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
}
- if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+ if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
{
const unsigned int src2_dim0 = src2->dimension(0);
const unsigned int src2_dim1 = src2->dimension(1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
- if(gemm_info.broadcast_bias)
+ if (gemm_info.broadcast_bias)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+ "Incorrect dimension of bias matrix which is to be broadcasted");
}
else
{
@@ -125,18 +140,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
}
}
- TensorShape tensor_shape1{ src1->tensor_shape() };
+ TensorShape tensor_shape1{src1->tensor_shape()};
tensor_shape1.set(0, n);
tensor_shape1.set(1, k);
- const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
- const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+ const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+ const TensorInfo tensor_info_reshaped1 =
+ src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+ const TensorInfo tensor_info_dst =
+ dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
}
@@ -144,7 +161,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info)
{
@@ -152,11 +173,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
TensorInfo tmp_info(*dst);
- if(reinterpret_output_as_3d)
+ if (reinterpret_output_as_3d)
{
// Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -204,19 +226,26 @@ ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyR
_type = CLKernelType::GEMM;
}
-void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
float beta,
const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+ auto_init_if_empty(
+ *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- auto padding_info = get_padding_info({ src0, src1, src2, dst });
+ auto padding_info = get_padding_info({src0, src1, src2, dst});
_add_bias = src2 != nullptr;
_export_to_cl_image = rhs_info.export_to_cl_image;
@@ -236,7 +265,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon
// Create build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
- build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+ "-DALPHA=" + float_to_string_with_full_precision(alpha));
build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
@@ -249,7 +279,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon
build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+ build_opts.add_option("-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
@@ -283,37 +314,44 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+ const GEMMRHSMatrixInfo &rhs_info,
+ const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
- src1->clone().get(),
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
src2 != nullptr ? src2->clone().get() : nullptr,
- dst->clone().get(),
- lhs_info,
- rhs_info,
- gemm_info)
- .first);
+ dst->clone().get(), lhs_info, rhs_info, gemm_info)
+ .first);
return Status{};
}
-void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
- if(src1->info()->num_dimensions() < 3)
+ if (src1->info()->num_dimensions() < 3)
{
// The stride_z for matrix B must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -321,12 +359,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
cl::Image2D src1_image2d;
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
- const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
+ const TensorShape shape2d(src1->info()->dimension(0) / 4,
+ src1->info()->dimension(1) * src1->info()->dimension(2));
const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+ src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
Window slice = window.first_slice_window_3D();
@@ -336,14 +376,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
unsigned int idx = 0;
add_3d_tensor_nhw_argument(idx, src0);
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
_kernel.setArg(idx++, src1_image2d);
}
add_3d_tensor_nhw_argument(idx, src1);
// Bias buffer (_add_bias == true)
- if(_add_bias)
+ if (_add_bias)
{
add_3d_tensor_nhw_argument(idx, src2);
}
@@ -358,8 +398,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
// LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
// LWS also enforces the order of execution of the workitems which improves cache utilization
enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
index 59612fcf5d..86d3012f6e 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -59,7 +60,13 @@ public:
* rhs_info.transpose: false
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src0,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float alpha,
+ float beta,
const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info);
@@ -69,7 +76,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float alpha,
+ float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
const GEMMRHSMatrixInfo &rhs_info,
const GEMMKernelInfo &gemm_info);
@@ -77,11 +90,11 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _add_bias{ false };
- bool _export_to_cl_image{ false };
- signed int _m{ 1 };
- signed int _n{ 1 };
- signed int _k{ 1 };
+ bool _add_bias{false};
+ bool _export_to_cl_image{false};
+ signed int _m{1};
+ signed int _n{1};
+ signed int _k{1};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
index bf4b664b6e..eea2a169a3 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,13 +47,17 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+ "Only 2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose);
@@ -60,10 +65,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -71,14 +77,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
return Status{};
}
-Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Window
+configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
{
const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
TensorInfo tmp_info(*src);
- if(reinterpret_input_as_3d)
+ if (reinterpret_input_as_3d)
{
// Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
// the window needs to be constructed on the 2D collapsed version of the tensor
@@ -88,10 +95,12 @@ Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixI
}
// dst auto inizialitation if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)));
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(
+ *src, lhs_info, reinterpret_input_as_3d)));
// Configure window
- Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
// Collapse along the Z direction
// This collapse needs to be here in order to tune the Z dimension of LWS
@@ -106,14 +115,18 @@ ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
- auto padding_info = get_padding_info({ src });
+ auto padding_info = get_padding_info({src});
const unsigned int src_w = src->dimension(0);
const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
@@ -168,7 +181,10 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
return Status{};
@@ -179,8 +195,9 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -192,8 +209,7 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi
add_3d_tensor_nhw_argument(idx, src);
add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
index db88e0d735..8e84e8ad8e 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
@@ -57,14 +57,21 @@ public:
* lhs_info.interleave: true, false
* @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_src_as_3d = false);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmReshapeLhsMatrixKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_src_as_3d);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -72,4 +79,4 @@ public:
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
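The hunks above all apply the same layout conventions: a space after control keywords (`if (`, `while (`), braced-init lists without inner padding (`{src}`), the `} while (...)` closer kept on one line, and declarations that exceed the column limit broken to one parameter per line. The snippet below is an illustrative sketch of that style on a hypothetical helper; none of the names are ACL symbols and the formatting settings that produce it are assumed, not reproduced from the repository.

#include <cstddef>
#include <vector>

namespace example
{
// Illustrative only: a hypothetical function laid out in the style the hunks
// above apply. Precondition: start < end and end <= values.size().
int accumulate_positive(const std::vector<int> &values,
                        std::size_t             start,
                        std::size_t             end,
                        bool                    skip_zeros)
{
    int         sum = 0;
    std::size_t i   = start;
    do
    {
        if (skip_zeros && values[i] == 0)
        {
            ++i;
            continue;
        }
        sum += (values[i] > 0) ? values[i] : 0;
        ++i;
    } while (i < end);
    return sum;
}
} // namespace example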
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
index b3a03880ed..b9ce3873c7 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
@@ -31,6 +31,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -52,8 +53,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+ "Only 2,3,4,8,16 are supported for n0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)),
+ "Only 1,2,3,4,8,16 are supported for k0");
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
@@ -61,15 +64,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
- const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type());
+ const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1,
+ src->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
}
@@ -77,23 +82,27 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
{
const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
bool window_changed = false;
// dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
// Configure window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win =
+ calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x,
+ num_elems_processed_per_iteration_y);
window_changed = update_window_and_padding(win, src_access);
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
gemm::update_padding_for_cl_image(dst);
}
@@ -102,7 +111,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
// This collapse needs to be here in order to tune the Z dimension of LWS
Window collapsed = win.collapse(win, Window::DimZ);
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, collapsed);
}
} // namespace
@@ -112,7 +122,10 @@ ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMRHSMatrixInfo &rhs_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -143,7 +156,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con
_kernel.setArg<cl_int>(idx++, rhs_info.h0);
}
-Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const GEMMRHSMatrixInfo &rhs_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first);
@@ -156,8 +171,9 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -169,9 +185,8 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi
add_3d_tensor_nhw_argument(idx, src);
add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ } while (window.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
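The n0/k0 checks in the RHS-reshape validation above use the bit trick `v & (v - 1) == 0` (true only for powers of two) plus an explicit exception for 3, with the upper bound of 16 enforced separately. A minimal standalone sketch of that predicate follows; the helper name is invented for illustration and is not ACL code.

#include <cstdio>

// A value passes when it is a power of two or exactly 3, mirroring the shape
// of the checks above; bounds such as <= 16 are applied separately.
static bool is_pow2_or_3(unsigned int v)
{
    return v != 0 && (((v & (v - 1)) == 0) || v == 3);
}

int main()
{
    // Prints "1 2 3 4 8 16", the set named by the error messages above.
    for (unsigned int v = 1; v <= 16; ++v)
    {
        if (is_pow2_or_3(v))
        {
            std::printf("%u ", v);
        }
    }
    std::printf("\n");
    return 0;
}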
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
index 31eaa46e02..7203d574fb 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
@@ -66,7 +66,10 @@ public:
* rhs_info.transpose: true, false
* rhs_info.interleave: true, false
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const GEMMRHSMatrixInfo &rhs_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClGemmReshapeRhsMatrixKernel::configure()
@@ -81,4 +84,4 @@ public:
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
index 719201d1fe..2e1cefc6e7 100644
--- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
@@ -62,8 +62,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co
}
} // namespace
-ClHeightConcatenateKernel::ClHeightConcatenateKernel()
- : _height_offset(0)
+ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0)
{
_type = CLKernelType::ELEMENTWISE;
}
@@ -74,12 +73,15 @@ Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int
return Status{};
}
-void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
+void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int height_offset,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
_height_offset = height_offset;
@@ -90,9 +92,10 @@ void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_contex
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -125,8 +128,9 @@ void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &windo
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
unsigned int idx = 0;
add_4D_tensor_argument(idx, src, window);
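The configure step above bakes the vector width and the tail length into the OpenCL build options (`-DVEC_SIZE`, `-DVEC_SIZE_LEFTOVER=` the row length modulo the per-iteration element count) so the kernel can handle the ragged end of each row at compile time. Below is a hedged host-side sketch of that arithmetic; the fixed preferred width and the names are illustrative stand-ins, not the library's `adjust_vec_size` helper.

#include <algorithm>
#include <cstdio>
#include <string>

int main()
{
    // Sketch only: derive a vector width and the leftover tail for a row of
    // `width` elements, then render them as -D options as configure() does.
    const unsigned int width         = 70; // e.g. dst row length in elements
    const unsigned int preferred_vec = 16; // assumed preferred width
    const unsigned int vec_size      = std::min(preferred_vec, width);
    const unsigned int vec_leftover  = width % vec_size; // elements handled by the tail path

    const std::string opts = "-DVEC_SIZE=" + std::to_string(vec_size) +
                             " -DVEC_SIZE_LEFTOVER=" + std::to_string(vec_leftover);
    std::printf("%s\n", opts.c_str()); // -DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=6
    return 0;
}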
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
index d3c077fc22..5a391a1212 100644
--- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
@@ -50,7 +50,8 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClHeightConcatenateKernel::configure()
@@ -64,7 +65,7 @@ public:
private:
unsigned int _height_offset;
- int32_t _depth{ 0 };
+ int32_t _depth{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
index e890847199..ef7a52828f 100644
--- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
@@ -29,9 +29,10 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -60,13 +61,19 @@ struct Im2ColConfiguration
bool is_padding_required_nchw{};
};
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
{
const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
@@ -82,9 +89,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
- const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
+ const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(
+ compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -93,13 +101,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_elems_processed_per_iteration,
+ bool is_padding_required_nchw,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
+ TensorShape expected_output_shape =
+ compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape));
@@ -113,22 +129,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
bool window_changed = false;
Window win;
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
}
else
{
- if(is_padding_required_nchw)
+ if (is_padding_required_nchw)
{
- const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- win = calculate_max_window(*src,
- Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
- AccessWindowStatic input_access(src,
- -border.left,
- -border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
- input_height + border.bottom);
+ const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(),
+ conv_info.pad_left());
+ win = calculate_max_window(
+ *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
+ AccessWindowStatic input_access(
+ src, -border.left, -border.top,
+ ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
+ input_height + border.bottom);
window_changed = window_changed || update_window_and_padding(win, input_access);
}
else
@@ -142,11 +158,17 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
// set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
{
const DataLayout data_layout = src->data_layout();
const DataType data_type = src->data_type();
@@ -157,7 +179,8 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
const unsigned int input_height = src->dimension(height_idx);
const unsigned int input_channel = src->dimension(channel_idx);
- const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ const std::pair<unsigned int, unsigned int> convolved_dims =
+ scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
// Im2Col configuration
std::string kernel_name = "im2col_generic_";
@@ -184,21 +207,22 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
- build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
+ build_opts.add_option_if_else(is_data_type_quantized(data_type),
+ "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
build_opts.add_option_if(has_bias, "-DHAS_BIAS");
- if(data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NHWC)
{
num_elems_processed_per_iteration = std::min(2U, input_channel);
is_padding_required_nchw = false;
// Only the 3x3 and 9x9 cases are optimized for NHWC
- if(kernel_dims == Size2D(3U, 3U))
+ if (kernel_dims == Size2D(3U, 3U))
{
kernel_name = "im2col3x3_";
build_opts.add_option("-DIM2COL_3X3");
}
- else if(kernel_dims == Size2D(9U, 9U))
+ else if (kernel_dims == Size2D(9U, 9U))
{
kernel_name = "im2col9x9_";
build_opts.add_option("-DIM2COL_9X9");
@@ -219,17 +243,17 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
}
else
{
- if(dilation == Size2D(1U, 1U))
+ if (dilation == Size2D(1U, 1U))
{
const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- if(squared_im2col)
+ if (squared_im2col)
{
// Check if we can run an optimized im2col for NCHW
- switch(kernel_dims.width)
+ switch (kernel_dims.width)
{
case 1:
// Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding())
+ if (conv_info.stride().first == 1 && !conv_info.has_padding())
{
kernel_name = "im2col1x1_stridex1_";
num_elems_processed_per_iteration = 4;
@@ -248,7 +272,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
break;
case 11:
// Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding())
+ if (!conv_info.has_padding())
{
kernel_name = "im2col11x11_padx0_pady0_";
num_elems_processed_per_iteration = 1;
@@ -262,7 +286,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
break;
}
}
- else if(kernel_dims.width > 1 && !conv_info.has_padding())
+ else if (kernel_dims.width > 1 && !conv_info.has_padding())
{
kernel_name = "im2col_generic_padx0_pady0_";
num_elems_processed_per_iteration = 1;
@@ -297,19 +321,29 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D
} // namespace
ClIm2ColKernel::ClIm2ColKernel()
- : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
+ : _data_layout(DataLayout::UNKNOWN),
+ _convolved_dims(),
+ _num_elems_processed_per_iteration(1),
+ _kernel_dims(),
+ _conv_info(),
+ _num_groups()
{
_type = CLKernelType::ELEMENTWISE;
}
-void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation,
- unsigned int num_groups)
+void ClIm2ColKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
_data_layout = src->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
@@ -320,19 +354,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI
// Select and configure the optimal OpenCL kernel to run.
// This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
// and the padding requirement flag
- Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+ Im2ColConfiguration im2col_config =
+ configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
// Create kernel
_kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
- _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ _convolved_dims =
+ scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
_num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
_kernel_dims = kernel_dims; // Only needed by the Tuner
_conv_info = conv_info; // Only needed by the Tuner
_num_groups = num_groups;
// Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+ auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation,
+ im2col_config.num_elems_processed_per_iteration,
im2col_config.is_padding_required_nchw, num_groups);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
IClKernel::configure_internal(win_config.second);
@@ -353,14 +390,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
}
-Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
- unsigned int num_groups)
+Status ClIm2ColKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
- Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+ Im2ColConfiguration im2col_config =
+ configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims,
+ conv_info, has_bias, dilation,
+ im2col_config.num_elems_processed_per_iteration,
im2col_config.is_padding_required_nchw, num_groups)
- .first);
+ .first);
return Status{};
}
@@ -388,7 +433,7 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
Window slice_in = first_slice_3d;
Window slice_out = window_output.first_slice_window_2D();
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
const int num_batches = tmp_win[3].end();
@@ -398,7 +443,10 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
}
else
{
- slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
+ slice.set(0,
+ Window::Dimension(
+ 0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)),
+ _num_elems_processed_per_iteration));
slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
// Note: In case of NCHW the 3rd dimension is already set collapsing the input window
}
@@ -414,14 +462,16 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
+ unsigned int idx = num_arguments_per_3D_tensor() +
+ (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
+ _kernel.setArg<cl_uint>(idx++,
+ static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
do
{
unsigned int idx = 0;
add_3D_tensor_argument(idx, src, slice_in);
- if(_num_groups == 1)
+ if (_num_groups == 1)
{
add_2D_tensor_argument(idx, dst, slice_out);
}
@@ -430,8 +480,8 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
add_3D_tensor_argument(idx, dst, slice_out);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
+ } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) &&
+ window_collapsed.slide_window_slice_3D(slice_in));
}
} // namespace kernels
} // namespace opencl
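Several window computations above round a dimension up to a multiple of the per-iteration step (via `ceil_to_multiple`) before building the execution window, so every iteration covers a full step. The sketch below shows the underlying integer round-up; the helper is a stand-in written for illustration, not the ACL function.

#include <cassert>
#include <cstdio>

// Smallest multiple of `multiple` that is >= `value`.
static unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int multiple)
{
    assert(multiple != 0);
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    // e.g. a convolved width of 17 processed 4 elements per iteration
    // is padded out to 20 so the last iteration still reads a full step.
    std::printf("%u\n", ceil_to_multiple_sketch(17, 4)); // 20
    return 0;
}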
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h
index a637ad215d..c8cd5b328d 100644
--- a/src/gpu/cl/kernels/ClIm2ColKernel.h
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Size2D.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -77,28 +78,38 @@ public:
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClIm2ColKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation = Size2D(1U, 1U),
+ unsigned int num_groups = 1);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
public:
- DataLayout _data_layout;
+ DataLayout _data_layout;
std::pair<unsigned int, unsigned int> _convolved_dims;
- unsigned int _num_elems_processed_per_iteration;
- Size2D _kernel_dims;
- PadStrideInfo _conv_info;
- unsigned int _num_groups;
+ unsigned int _num_elems_processed_per_iteration;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
+ unsigned int _num_groups;
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
index d291fad76c..8c493d08c6 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,26 +44,29 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0),
+ "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+ "M0 can only be greater than 0 and less than or equal to 8");
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
- src->data_layout(),
- weights->tensor_shape(),
- conv_info,
- desc));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+ weights->tensor_shape(), conv_info, desc));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
}
@@ -75,8 +79,12 @@ ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculati
_type = CLKernelType::ELEMENTWISE;
}
-void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc));
@@ -85,11 +93,8 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte
constexpr unsigned int height_idx = 2;
// Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
- src->data_layout(),
- weights->tensor_shape(),
- conv_info,
- desc);
+ TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+ src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
@@ -136,14 +141,19 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte
// Since this kernel should be called only once, we do not need to set the config_id for tuning
}
-Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc));
return Status{};
}
-void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors,
+ const Window &window,
+ cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
index ff7f4be147..b565609c6a 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
@@ -60,16 +60,23 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] desc Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClIndirectConv2dAddressPreCalculationKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
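Like the other kernels touched above, this class pairs a static validate(), which inspects only tensor descriptors, with a configure() that actually builds kernel state after the same checks pass. The schematic below mimics that split with invented types standing in for ITensorInfo and Status; it is a sketch of the pattern, not the library's API.

#include <stdexcept>
#include <string>

struct FakeInfo
{
    int width  = 0;
    int height = 0;
};

class FakeKernel
{
public:
    // Pure check on descriptors; no resources are created.
    static bool validate(const FakeInfo &src, const FakeInfo &dst, std::string *why = nullptr)
    {
        if (src.width != dst.width)
        {
            if (why != nullptr)
            {
                *why = "src/dst width mismatch";
            }
            return false;
        }
        return true;
    }

    // Builds state only after the same checks pass.
    void configure(const FakeInfo &src, const FakeInfo &dst)
    {
        std::string why;
        if (!validate(src, dst, &why))
        {
            throw std::runtime_error(why);
        }
        _configured = true;
    }

private:
    bool _configured = false;
};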
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
index a337eb50fd..3510b6970c 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
@@ -23,13 +23,14 @@
*/
#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLUtils.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,8 +47,14 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *indirect_buffer,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
@@ -55,37 +62,38 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(indirect_buffer->tensor_shape(),
- misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
- src->data_layout(),
- weights->tensor_shape(),
- conv_info,
- desc));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ indirect_buffer->tensor_shape(),
+ misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+ weights->tensor_shape(), conv_info, desc));
constexpr int channel_idx = 0;
constexpr int batch_idx = 3;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+ "M0 can only be greater than 0 and less than or equal to 8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
"N0 can only be: 1, 2, 3, 4, 8, and 16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
"K0 can only be: 1, 2, 3, 4, 8, and 16");
- if(desc.export_weights_to_cl_image)
+ if (desc.export_weights_to_cl_image)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
- "K0 can only be: 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights),
"Export to CLImage is not supported for this weight configuration");
}
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -95,15 +103,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx),
"Biases size and number of dst feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
- "Biases should be one dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
}
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
@@ -116,13 +123,21 @@ ClIndirectConv2dKernel::ClIndirectConv2dKernel()
_type = CLKernelType::DIRECT;
}
-void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *indirect_buffer, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *indirect_buffer,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst);
// Perform validation
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
constexpr unsigned int channel_idx = 0;
constexpr unsigned int width_idx = 1;
@@ -137,10 +152,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape,
- 1,
- src->data_type(),
- src->quantization_info());
+ auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
// Configure kernel window
Window win;
@@ -164,7 +176,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
_export_to_cl_image = desc.export_weights_to_cl_image;
// Update the padding for the weights tensor if we can export to cl_image
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
gemm::update_padding_for_cl_image(weights);
}
@@ -173,11 +185,12 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
// When M0 is 5, 6, and 7, we use vload8 to fetch the data from the buffer
const unsigned int load_indirect_buf_size = m0 > 4 ? 8 : m0;
const unsigned int indirect_buf_width = indirect_buffer->tensor_shape()[0];
- const unsigned int round_up_width = ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size;
- const unsigned int padding = round_up_width - indirect_buf_width;
+ const unsigned int round_up_width =
+ ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size;
+ const unsigned int padding = round_up_width - indirect_buf_width;
indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0));
- if(biases != nullptr)
+ if (biases != nullptr)
{
build_options.add_option(std::string("-DHAS_BIAS"));
build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
@@ -186,9 +199,10 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
// Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
const auto act_function = act_info.activation();
- if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (data_type == DataType::F32 || data_type == DataType::F16))
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (data_type == DataType::F32 || data_type == DataType::F16))
{
// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -224,7 +238,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
// A macro guard to compile ONLY the kernel of interest
build_options.add_option("-D" + upper_string(kernel_name.str()));
- if(compile_context.get_ddk_version() >= 30)
+ if (compile_context.get_ddk_version() >= 30)
{
build_options.add_option("-fregister-allocation=64");
}
@@ -253,10 +267,17 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context,
_config_id += support::cpp11::to_string(dst->dimension(channel_idx));
}
-Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status ClIndirectConv2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *indirect_buffer,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
return Status{};
}
@@ -268,35 +289,42 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window,
// Get initial windows
Window slice = window.first_slice_window_3D();
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- const auto indirect_buffer = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ const auto indirect_buffer =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
cl::Image2D weights_cl_image;
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
- const size_t image_w = weights->info()->dimension(0) / 4;
- const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
+ const size_t image_w = weights->info()->dimension(0) / 4;
+ const size_t image_h =
+ weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
// Export cl_buffer to cl_image
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ weights_cl_image =
+ create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d,
+ weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
unsigned int idx = 0;
add_4d_tensor_nhwc_argument(idx, src);
add_4d_tensor_nhwc_argument(idx, indirect_buffer);
add_4d_tensor_nhwc_argument(idx, dst);
- if(_export_to_cl_image)
+ if (_export_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
add_4d_tensor_nhwc_argument(idx, weights);
- if(biases != nullptr)
+ if (biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
}
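The configure step above relaxes the math build flags only when the architecture bits of the GPU target match BIFROST (`(gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST`) and the activation and data-type conditions hold. The sketch below shows that bitmask classification with hypothetical enum values; the real GPUTarget encoding is not reproduced here.

#include <cstdint>
#include <cstdio>

// Hypothetical encoding: the high byte identifies the architecture family,
// the low bits the specific product. Values are illustrative only and do not
// match arm_compute::GPUTarget.
enum class FakeTarget : std::uint32_t
{
    ARCH_MASK = 0xFF00,
    FAMILY_A  = 0x0100,
    A_1       = 0x0101,
    A_2       = 0x0102,
    FAMILY_B  = 0x0200,
    B_1       = 0x0201,
};

static bool is_family_a(FakeTarget t)
{
    return (static_cast<std::uint32_t>(t) & static_cast<std::uint32_t>(FakeTarget::ARCH_MASK)) ==
           static_cast<std::uint32_t>(FakeTarget::FAMILY_A);
}

int main()
{
    std::printf("%d %d\n", is_family_a(FakeTarget::A_2), is_family_a(FakeTarget::B_1)); // 1 0
    return 0;
}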
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
index b6c7b35fa4..04166d417e 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -60,22 +61,35 @@ public:
* @param[in] act_info Contains activation information described in @ref ActivationLayerInfo.
* @param[in] desc Direct convolution descriptor used to build the NHWC indirect convolution kernel.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *off, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *off,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClIndirectConv2dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *off, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *off,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ const DirectConvComputeKernelInfo &desc);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
public:
- bool _export_to_cl_image{ false };
+ bool _export_to_cl_image{false};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
index 66331bc818..0bb6b0c083 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -29,17 +29,16 @@
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -62,51 +61,62 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
// Validate M0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
- if(adj_lhs)
+ if (adj_lhs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
}
// Validate N0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
// Validate K0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
- if(!adj_lhs || adj_rhs)
+ if (!adj_lhs || adj_rhs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+ "Only 1,2,3,4,8,16 are supported for K0");
}
return Status{};
}
-}
+} // namespace
ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU
- && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY &&
+ act_info.activation() != ActivationFunction::RELU &&
+ act_info.activation() != ActivationFunction::LU_BOUNDED_RELU &&
+ act_info.activation() != ActivationFunction::BOUNDED_RELU),
"Activation Function specified is unsupported.");
- const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -115,7 +125,12 @@ Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorI
return Status{};
}
-void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
@@ -123,7 +138,8 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context
ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
const int m = dst->dimension(1);
const int n = dst->dimension(0);
@@ -217,10 +233,13 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const ICLTensor *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *bias =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
@@ -229,7 +248,7 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window
add_3d_tensor_nhw_argument(idx, lhs);
add_3d_tensor_nhw_argument(idx, rhs);
- if(bias != nullptr)
+ if (bias != nullptr)
{
add_3d_tensor_nhw_argument(idx, bias);
}
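The M0/N0/K0 checks reflowed in this file's validate_matmul_kernel_info use a bit trick: (v & (v - 1)) is zero exactly for powers of two, 3 is allowed as a special case, and anything above 16 is rejected. A small standalone sketch (not library code) that enumerates the accepted block sizes:

#include <cstdio>

static bool is_valid_block_size(int v)
{
    // Mirrors ((v & (v - 1)) && (v != 3)) || (v > 16): true means "reject".
    const bool reject = ((v & (v - 1)) && (v != 3)) || (v > 16);
    return v >= 1 && !reject;
}

int main()
{
    for (int v = 1; v <= 20; ++v)
    {
        if (is_valid_block_size(v))
        {
            std::printf("%d ", v); // prints: 1 2 3 4 8 16
        }
    }
    std::printf("\n");
    return 0;
}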
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index 64415f42f7..ffdb720855 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -25,6 +25,7 @@
#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -54,7 +55,12 @@ public:
* @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
* @param[in] act_info (Optional) Class containing information about fused activation function.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -62,7 +68,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
index 464212d7db..94e3c4e47b 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
@@ -28,10 +28,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -64,13 +64,15 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
// Validate M0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
- if(adj_lhs)
+ if (adj_lhs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
}
// Validate N0
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
// Validate K0
ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0");
@@ -84,7 +86,11 @@ ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel()
_type = CLKernelType::GEMM;
}
-Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
@@ -102,16 +108,17 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen
ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY),
"Activation Function specified is unsupported.");
- const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -121,15 +128,21 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen
return Status{};
}
-void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst,
- const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
+void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info);
ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info);
CLBuildOptions build_opts;
@@ -147,7 +160,8 @@ void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_con
const unsigned int n0_leftover = n % n0;
// Configure kernel window
- const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+ const auto win_config =
+ validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
IClKernel::configure_internal(win_config.second);
@@ -215,10 +229,13 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
- auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+ tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+ auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
@@ -227,7 +244,7 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi
add_3d_tensor_nhw_argument(idx, lhs);
add_3d_tensor_nhw_argument(idx, rhs);
- if(bias != nullptr)
+ if (bias != nullptr)
{
add_3d_tensor_nhw_argument(idx, bias);
}
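The run_op hunks above keep each utils::cast::polymorphic_downcast call as a single statement per tensor. A rough standalone analogue of what such a checked downcast is assumed to do (a debug-time check that the static downcast agrees with dynamic_cast); the helper below is hypothetical and is not the library's implementation:

#include <cassert>

struct ITensorLike            { virtual ~ITensorLike() = default; };
struct CLTensorLike final : ITensorLike {};

template <typename Target, typename Source>
Target checked_downcast(Source *source)
{
    // Debug builds verify the cast is actually valid; release builds pay only for a static_cast.
    assert(dynamic_cast<Target>(source) == static_cast<Target>(source));
    return static_cast<Target>(source);
}

int main()
{
    CLTensorLike tensor;
    ITensorLike *base    = &tensor;
    auto        *derived = checked_downcast<CLTensorLike *>(base); // OK: dynamic type matches
    (void)derived;
    return 0;
}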
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
index d2aa40b2e2..6c56f15d74 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
@@ -25,6 +25,7 @@
#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -49,7 +50,12 @@ public:
*
* @return a status
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -57,7 +63,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 41ba5d5e25..a1fa9fa9ab 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -28,9 +28,9 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "src/common/utils/Log.h"
#include "src/core/CL/CLUtils.h"
@@ -38,7 +38,6 @@
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -61,20 +60,23 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
// Validate M0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
- if(adj_lhs)
+ if (adj_lhs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
}
// Validate N0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
// Validate K0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
- if(!adj_lhs || adj_rhs)
+ if (!adj_lhs || adj_rhs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+ "Only 1,2,3,4,8,16 are supported for K0");
}
return Status{};
@@ -83,30 +85,37 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings());
- if(matmul_kernel_info.export_rhs_to_cl_image)
+ if (matmul_kernel_info.export_rhs_to_cl_image)
{
- if(matmul_kernel_info.adj_rhs)
+ if (matmul_kernel_info.adj_rhs)
{
const int k0 = matmul_kernel_info.k0;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16,
+ "K0 can only be: 4, 8, and 16 for Rhs transposed");
}
else
{
const int n0 = matmul_kernel_info.n0;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16,
+ "N0 can only be: 4, 8, and 16 for Rhs non-transposed");
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs),
+ "Export to CLImage is not supported for this device/configuration");
}
return Status{};
}
-}
+} // namespace
ClMatMulNativeKernel::ClMatMulNativeKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
@@ -114,28 +123,36 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info));
- const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+ "First dimension of bias and output tensors must match.");
}
return Status{};
}
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
@@ -143,7 +160,8 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
const int m = dst->dimension(1);
const int n = dst->dimension(0);
@@ -187,7 +205,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
- if(_export_rhs_to_cl_image)
+ if (_export_rhs_to_cl_image)
{
gemm::update_padding_for_cl_image(rhs);
}
@@ -222,10 +240,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
- ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const ICLTensor *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+ tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
@@ -235,7 +256,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
add_3d_tensor_nhw_argument(idx, lhs);
cl::Image2D rhs_cl_image;
- if(_export_rhs_to_cl_image)
+ if (_export_rhs_to_cl_image)
{
const size_t image_w = rhs->info()->dimension(0) / 4;
const size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0);
@@ -243,12 +264,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1];
// Export cl_buffer to cl_image
- rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d,
+ rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
_kernel.setArg(idx++, rhs_cl_image);
}
add_3d_tensor_nhw_argument(idx, rhs);
- if(bias != nullptr)
+ if (bias != nullptr)
{
add_3d_tensor_nhw_argument(idx, bias);
}
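The run_op hunk above derives the 2D image from the RHS buffer: four elements per RGBA texel (hence dimension(0) / 4 for the width), one image row per tensor row, and the byte stride of dimension 1 as the row pitch, which also lines up with the N0/K0 in {4, 8, 16} requirement in validate_export_to_cl_image. A standalone arithmetic sketch with made-up values, assuming an unpadded fp32 buffer:

#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t n          = 64;            // tensor dimension 0 (hypothetical)
    const std::size_t k          = 128;           // tensor dimension 1 (hypothetical)
    const std::size_t elem_size  = sizeof(float); // fp32 element
    const std::size_t total_size = n * k;

    const std::size_t image_w         = n / 4;          // 4 elements per RGBA texel
    const std::size_t image_h         = total_size / n; // one image row per tensor row
    const std::size_t image_row_pitch = n * elem_size;  // strides_in_bytes()[1] when unpadded

    std::printf("image_w=%zu image_h=%zu row_pitch=%zu bytes\n", image_w, image_h, image_row_pitch);
    return 0;
}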
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index fe2b787c12..2cb150bc8f 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -25,6 +25,7 @@
#define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -52,7 +53,12 @@ public:
* @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel
* @param[in] act_info (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -60,14 +66,18 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- bool _export_rhs_to_cl_image{ false };
+ bool _export_rhs_to_cl_image{false};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
index 432270e8bf..76bf846e74 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
@@ -28,14 +28,13 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -62,31 +61,38 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
// Validate M0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
- if(adj_lhs)
+ if (adj_lhs)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
}
// Validate N0
ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
// Validate K0
ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0");
return Status{};
}
-}
+} // namespace
ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_arm_matrix_multiply is not supported on the target platform");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
@@ -96,32 +102,40 @@ Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorI
const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
- const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+ const TensorShape expected_output_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+ "First dimension of bias and output tensors must match.");
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias);
}
return Status{};
}
-void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
// dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
const int m = dst->dimension(1);
const int n = dst->dimension(0);
@@ -135,7 +149,8 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context
const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
// Configure kernel window
- const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+ const auto win_config =
+ validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
IClKernel::configure_internal(win_config.second);
@@ -186,17 +201,20 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
- ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const ICLTensor *lhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const ICLTensor *rhs =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+ tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
unsigned int idx = 0;
add_3d_tensor_nhw_argument(idx, lhs);
add_3d_tensor_nhw_argument(idx, rhs);
- if(bias != nullptr)
+ if (bias != nullptr)
{
add_3d_tensor_nhw_argument(idx, bias);
}
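The validate hunks above read the K dimension as lhs_shape.y() or lhs_shape.x() depending on adj_lhs. A standalone sketch of that shape bookkeeping follows; it reflects an assumption about what compute_matmul_shape produces (dst dimension 0 = N, dimension 1 = M), not the library code itself:

#include <cstdio>

struct Shape2D { int x; int y; };

Shape2D matmul_output_shape(Shape2D lhs, Shape2D rhs, bool adj_lhs, bool adj_rhs)
{
    const int m = adj_lhs ? lhs.x : lhs.y; // rows of the (possibly transposed) LHS
    const int k = adj_lhs ? lhs.y : lhs.x; // reduction dimension
    const int n = adj_rhs ? rhs.y : rhs.x; // columns of the (possibly transposed) RHS
    (void)k;                               // K only needs to match between LHS and RHS
    return Shape2D{n, m};                  // dst dimension 0 = N, dimension 1 = M
}

int main()
{
    const Shape2D lhs{/*x=*/8, /*y=*/4};  // K = 8, M = 4 (not transposed)
    const Shape2D rhs{/*x=*/16, /*y=*/8}; // N = 16, K = 8 (not transposed)
    const Shape2D dst = matmul_output_shape(lhs, rhs, false, false);
    std::printf("dst = [%d, %d]\n", dst.x, dst.y); // dst = [16, 4]
    return 0;
}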
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
index 80448974c4..1aeb896325 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -72,22 +72,31 @@ public:
* @param[out] dst Output tensor info.
* @param[in] matmul_info Attributes for Batch MatMul Kernel
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulNativeMMULKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- int _m{ 1 };
- int _n{ 1 };
- int _k{ 1 };
+ int _m{1};
+ int _n{1};
+ int _k{1};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp
index 5ca0639852..3b59c2a7fc 100644
--- a/src/gpu/cl/kernels/ClMulKernel.cpp
+++ b/src/gpu/cl/kernels/ClMulKernel.cpp
@@ -23,15 +23,16 @@
*/
#include "src/gpu/cl/kernels/ClMulKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,24 +47,25 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
- DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
@@ -76,27 +78,35 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
- 1,
- DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+ DataType::F16, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 &&
+ (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
"Dst can only be U8 if both src are U8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
- "Dst can only be QASYMM8 if both src are QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
- "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
- "Dst can only be QSYMM16 if both src are QSYMM16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->data_type() == DataType::QASYMM8 &&
+ (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
+ "Dst can only be QASYMM8 if both src are QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->data_type() == DataType::QASYMM8_SIGNED &&
+ (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
+ "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ dst->data_type() == DataType::QSYMM16 &&
+ (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
+ "Dst can only be QSYMM16 if both src are QSYMM16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) &&
+ (dst->data_type() != DataType::S32),
"Dst must be S32 if source tensors are S32");
- if(in_place)
+ if (in_place)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
- "Wrong shape for dst, cannot do in_place calculation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ detail::have_different_dimensions(out_shape,
+ src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
}
else
{
@@ -114,14 +124,19 @@ ClMulKernel::ClMulKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void ClMulKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
- scale, overflow_policy, rounding_policy, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
- auto padding_info = get_padding_info({ src1, src2, dst });
+ auto padding_info = get_padding_info({src1, src2, dst});
const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
@@ -133,7 +148,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
// Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
// frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1
// Moreover, it will be negative as we deal with 1/2^n
- if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
{
// Store the positive exponent. We know that we compute 1/2^n
// Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
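A standalone sketch of the check described in the comments above, using only standard <cmath>: it recovers n when scale is exactly 1/2^n with 0 <= n <= 15 and reports -1 otherwise, which is when the kernel falls back to the float scaling path.

#include <cmath>
#include <cstdio>

int scale_exponent(float scale)
{
    int         exponent = 0;
    const float mantissa = std::frexp(scale, &exponent);
    // scale == 1/2^n  =>  mantissa == 0.5 and exponent == 1 - n, i.e. -14 <= exponent <= 1
    if (mantissa == 0.5f && exponent >= -14 && exponent <= 1)
    {
        return 1 - exponent;
    }
    return -1;
}

int main()
{
    std::printf("%d %d %d\n", scale_exponent(1.0f), scale_exponent(0.125f), scale_exponent(0.3f));
    // prints: 0 3 -1
    return 0;
}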
@@ -142,19 +157,19 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
std::string acc_type;
// Check if it has float src and dst
- if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
+ if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
{
scale_int = -1;
acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
}
else
{
- if(src1->element_size() == 4 || src2->element_size() == 4)
+ if (src1->element_size() == 4 || src2->element_size() == 4)
{
// use 64 bit accumulator for 32-bit input
acc_type = "long";
}
- else if(src1->element_size() == 2 || src2->element_size() == 2)
+ else if (src1->element_size() == 2 || src2->element_size() == 2)
{
// Use 32-bit accumulator for 16-bit input
acc_type = "int";
@@ -176,11 +191,15 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
- build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1)
+ ? "1"
+ : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1)
+ ? "1"
+ : support::cpp11::to_string(vec_size)));
build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
- if(is_quantized && (dst->data_type() != DataType::S32))
+ if (is_quantized && (dst->data_type() != DataType::S32))
{
const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
@@ -200,12 +219,14 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
else
{
kernel_name += (scale_int >= 0) ? "_int" : "_float";
- build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE");
+ build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()),
+ "-DWRAP", "-DSATURATE");
build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
- if(act_info.enabled())
+ if (act_info.enabled())
{
- build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+ build_opts.add_option("-DACTIVATION_TYPE=" +
+ lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
}
@@ -223,7 +244,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
// Set scale argument
unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
- if(scale_int >= 0 && !is_quantized)
+ if (scale_int >= 0 && !is_quantized)
{
_kernel.setArg(idx++, scale_int);
}
@@ -261,8 +282,13 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
_config_id += support::cpp11::to_string(dst->dimension(2));
}
-Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status ClMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
@@ -275,9 +301,11 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src_0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src_1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
@@ -286,17 +314,18 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
const TensorShape &out_shape = dst->info()->tensor_shape();
bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -312,7 +341,7 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
unsigned int idx = 0;
add_3D_tensor_argument(idx, src_0, slice_input1);
add_3D_tensor_argument(idx, src_1, slice_input2);
- if(!in_place)
+ if (!in_place)
{
add_3D_tensor_argument(idx, dst, slice);
}
@@ -320,15 +349,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
namespace
{
constexpr unsigned int vec_size_complex = 1;
-Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status validate_arguments_complex(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32);
@@ -340,11 +371,12 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
// Validate in case of configured dst
- if(dst->total_size() > 0)
+ if (dst->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
}
return Status{};
@@ -356,19 +388,23 @@ ClComplexMulKernel::ClComplexMulKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClComplexMulKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
- auto padding_info = get_padding_info({ src1, src2, dst });
+ auto padding_info = get_padding_info({src1, src2, dst});
const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
- if(act_info.enabled())
+ if (act_info.enabled())
{
build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
@@ -384,7 +420,10 @@ void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITen
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClComplexMulKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
@@ -397,26 +436,29 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src_0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto src_1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
const TensorShape &in_shape1 = src_0->info()->tensor_shape();
const TensorShape &in_shape2 = src_1->info()->tensor_shape();
const TensorShape &out_shape = dst->info()->tensor_shape();
bool can_collapse = true;
- if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+ if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
{
can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+ for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
{
can_collapse = (in_shape1[d] == in_shape2[d]);
}
}
bool has_collapsed = false;
- Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+ Window collapsed =
+ can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -435,8 +477,7 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
- }
- while(collapsed.slide_window_slice_3D(slice));
+ } while (collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
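
Aside (not part of the patch): the complex-multiplication run_op above relies on TensorShape::broadcast_shape() plus window collapsing; a minimal hedged sketch of the broadcast rule, using only the TensorShape API already referenced in this file and illustrative dimensions:

    // Sketch only: numpy-style broadcasting as used by the multiplication kernels above.
    #include "arm_compute/core/TensorShape.h"

    void broadcast_shape_example()
    {
        const arm_compute::TensorShape in1(8U, 4U, 3U); // full shape
        const arm_compute::TensorShape in2(8U, 1U, 3U); // dimension 1 broadcasts against in1
        const arm_compute::TensorShape out = arm_compute::TensorShape::broadcast_shape(in1, in2);
        // Expected: (8, 4, 3); an empty shape would signal incompatible inputs.
        (void)out;
    }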
diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h
index 4e62a6d67a..76a3ce02c1 100644
--- a/src/gpu/cl/kernels/ClMulKernel.h
+++ b/src/gpu/cl/kernels/ClMulKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_MUL_KERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -72,16 +73,27 @@ public:
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMulKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -101,14 +113,21 @@ public:
* @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClComplexMulKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
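
Aside (not part of the patch): a hedged sketch of how the re-wrapped ClMulKernel::validate() overload above could be exercised; only the signature is taken from this header, the shapes and policies are illustrative:

    // Sketch only: checking a float multiplication configuration.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/gpu/cl/kernels/ClMulKernel.h"

    arm_compute::Status check_mul_config()
    {
        using namespace arm_compute;
        const TensorInfo src1(TensorShape(16U, 16U), 1, DataType::F32);
        const TensorInfo src2(TensorShape(16U, 16U), 1, DataType::F32);
        TensorInfo       dst(TensorShape(16U, 16U), 1, DataType::F32);

        // scale 1, saturating conversion, round-to-zero, activation left at its default.
        return opencl::kernels::ClMulKernel::validate(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE,
                                                      RoundingPolicy::TO_ZERO);
    }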
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp
index 8d4655114b..a4755782ed 100644
--- a/src/gpu/cl/kernels/ClPermuteKernel.cpp
+++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp
@@ -29,8 +29,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -60,13 +61,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
"Permutation up to 4-D src tensor is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
"Permutation vector size should be less than or equal to 4");
- for(const auto &p : perm)
+ for (const auto &p : perm)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
}
// Validate configured dst
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -82,10 +83,13 @@ ClPermuteKernel::ClPermuteKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+void ClPermuteKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
const TensorShape dst_shape = get_dst_shape(src, perm);
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
@@ -96,7 +100,8 @@ void ClPermuteKernel::configure(const CLCompileContext &compile_context, const I
// Create kernel
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
+ build_opts.add_option("-DDATA_TYPE=" +
+ get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
// New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
@@ -126,8 +131,9 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
@@ -144,9 +150,8 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com
add_4D_tensor_argument(idx, src, slice_in);
add_4D_tensor_argument(idx, dst, slice_out);
enqueue(queue, *this, slice_in, lws_hint());
- }
- while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+ } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h
index 0d349e739b..2413b10284 100644
--- a/src/gpu/cl/kernels/ClPermuteKernel.h
+++ b/src/gpu/cl/kernels/ClPermuteKernel.h
@@ -52,7 +52,10 @@ public:
* @param[in] dst The dst tensor info. Data types supported: Same as @p src
* @param[in] perm Permutation vector
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClPermuteKernel::configure()
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
index a1afc585e0..41ab4d6922 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,37 +44,47 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
- "Unsupported combination of parameters!");
-
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const bool is_global_pooling = pool_info.is_global_pooling;
- unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
- int output_width = 0;
- int output_height = 0;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
-
- std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- pool_size_x, pool_size_y, pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
+ "Unsupported combination of parameters!");
+
+ const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ int output_width = 0;
+ int output_height = 0;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
+
+ std::tie(output_width, output_height) =
+ scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x,
+ pool_size_y, pool_info.pad_stride_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+ "Calculated output dimension size is invalid");
// Check indices
- if(indices)
+ if (indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX,
+ "Pooling indices only supported for MAX pooling method");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)),
+ "Pooling indices only supported for pool size 2x2");
- if(indices->total_size() != 0)
+ if (indices->total_size() != 0)
{
TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
@@ -81,7 +92,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
}
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -98,42 +109,47 @@ ClPool2dKernel::ClPool2dKernel()
_type = CLKernelType::POOL;
}
-void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void ClPool2dKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
- auto padding_info = get_padding_info({ src, dst, indices });
+ auto padding_info = get_padding_info({src, dst, indices});
// Auto init if empty
TensorShape out_shape = compute_pool_shape(*src, pool_info);
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
- if(indices)
+ if (indices)
{
auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
}
// Set instance variables
- _pool_info = pool_info;
- _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
+ _pool_info = pool_info;
+ _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+ _num_elems_processed_per_iteration =
+ (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
_num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const PoolingType pool_type = pool_info.pool_type;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
- const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const bool exclude_padding = pool_info.exclude_padding;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ const PoolingType pool_type = pool_info.pool_type;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+ const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ const bool exclude_padding = pool_info.exclude_padding;
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const DataType data_type = src->data_type();
+ const int pool_pad_top = pad_stride_info.pad_top();
+ const int pool_pad_left = pad_stride_info.pad_left();
+ const DataType data_type = src->data_type();
// Set build options
CLBuildOptions build_opts;
@@ -148,20 +164,23 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
- build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
- build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+ build_opts.add_option("-DMAX_WIDTH=" +
+ support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+ build_opts.add_option("-DMAX_HEIGHT=" +
+ support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
// Tensor paddings are used to calculate the indices for MAX pooling
- if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+ if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+ is_data_type_float(data_type))
{
build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3)));
}
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
build_opts.add_option("-DQUANTIZED");
- if(src->quantization_info() != dst->quantization_info())
+ if (src->quantization_info() != dst->quantization_info())
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -174,9 +193,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
}
// Set the initial value for the pooling operation accordingly with the data type
- if(pool_type == PoolingType::MAX)
+ if (pool_type == PoolingType::MAX)
{
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
PixelValue type_min{};
std::tie(type_min, std::ignore) = get_min_max(data_type);
@@ -184,7 +203,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
}
else
{
- std::string initial_value = pool_info.use_inf_as_limit ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+ std::string initial_value = pool_info.use_inf_as_limit
+ ? "(-INFINITY)"
+ : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
build_opts.add_option("-DINITIAL_VALUE=" + initial_value);
}
}
@@ -195,22 +216,25 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
}
// Create kernel
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
- const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
+ const auto acc_data_type = get_cl_type_from_data_type(
+ use_wider_accumulator ? DataType::F32
+ : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
- if(pool_type != PoolingType::MAX)
+ if (pool_type != PoolingType::MAX)
{
build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
}
- if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+ if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+ is_data_type_float(data_type))
{
// For max pooling with pool2x2, store indices which will be used in max unpooling
std::string kernel_name = "pooling_layer_2_nchw_indices";
@@ -226,18 +250,19 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
case DataLayout::NHWC:
{
// Floating point mixed precision is supported on F16 only
- const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+ const auto use_fp_mixed_precision =
+ (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
// Wider accumulation is required to avoid accuracy loss
// Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
// Case 2: Quantized (int8/uint8 src data and int32 accumulation)
DataType acc_data_type = data_type;
- if(use_fp_mixed_precision)
+ if (use_fp_mixed_precision)
{
acc_data_type = DataType::F32;
}
- else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+ else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
{
acc_data_type = DataType::S32;
}
@@ -250,8 +275,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
- if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+ if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
{
build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
@@ -260,7 +286,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
}
else
{
- std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+ std::string kernel_name = is_data_type_quantized_asymmetric(data_type)
+ ? "pooling_layer_MxN_quantized_nhwc"
+ : "pooling_layer_MxN_nhwc";
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
}
break;
@@ -290,7 +318,10 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status ClPool2dKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
return Status{};
@@ -301,18 +332,19 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- unsigned int pool_stride_x = 0;
- unsigned int pool_stride_y = 0;
+ unsigned int pool_stride_x = 0;
+ unsigned int pool_stride_y = 0;
std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
- auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+ auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
// Collapse window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -323,13 +355,12 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
unsigned int idx = 0;
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
- if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
+ if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
{
add_3D_tensor_argument(idx, indices, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
break;
}
case DataLayout::NHWC:
@@ -338,7 +369,8 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
Window slice = window_collapsed.first_slice_window_4D();
Window in_slice = window_collapsed.first_slice_window_4D();
- in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
+ in_slice.set(Window::DimX,
+ Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
in_slice.set(3, Window::Dimension(0, batch_size, 1));
@@ -348,13 +380,13 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
unsigned int idx = 0;
add_4D_tensor_argument(idx, src, in_slice);
add_4D_tensor_argument(idx, dst, slice);
- if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+ if (indices && is_data_type_float(src->info()->data_type()) &&
+ (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
{
add_4D_tensor_argument(idx, indices, slice);
}
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
+ } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
break;
}
default:
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h
index f5bb0687e8..56b95a37d5 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.h
+++ b/src/gpu/cl/kernels/ClPool2dKernel.h
@@ -50,22 +50,29 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClPool2dKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
public:
PoolingLayerInfo _pool_info{};
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- unsigned int _num_elems_processed_per_iteration{ 1 };
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{1};
};
} // namespace kernels
} // namespace opencl
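
Aside (not part of the patch): a similar hedged sketch for the reformatted ClPool2dKernel::validate() entry point; the PoolingLayerInfo members assigned below are the ones read in ClPool2dKernel.cpp, and the concrete values are illustrative:

    // Sketch only: checking a 2x2 MAX pooling configuration driven through pool_info (NHWC).
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/gpu/cl/kernels/ClPool2dKernel.h"

    arm_compute::Status check_pool2d_config()
    {
        using namespace arm_compute;
        const TensorInfo src(TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32);
        TensorInfo       dst; // left empty: the pooled shape is derived during validation

        PoolingLayerInfo pool_info;
        pool_info.pool_type       = PoolingType::MAX;
        pool_info.pool_size       = Size2D(2, 2);
        pool_info.data_layout     = DataLayout::NHWC;
        pool_info.pad_stride_info = PadStrideInfo(2, 2, 0, 0); // stride 2, no padding

        return opencl::kernels::ClPool2dKernel::validate(&src, &dst, pool_info, nullptr);
    }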
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp
index d068832fed..a08c5d4be7 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,10 +51,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
- && (pool_info.pool_type == PoolingType::AVG)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0),
+ "Strides cannot be zero.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED,
+ DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+ (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
"Exclude padding is unsupported for non-float types for Avg op");
const auto data_layout = src->data_layout();
@@ -68,17 +72,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
int output_height = 0;
int output_depth = 0;
- bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric.");
+ bool round_type_ceil_with_asymm_padding =
+ (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding,
+ "Cannot use dimension round type CEIL when padding is asymmetric.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
- std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- src->tensor_shape()[idx_depth], pool_size_x, pool_size_y,
- pool_size_z, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+ "Pooling region that is entirely outside input tensor is unsupported");
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+ src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
// Checks performed when dst is configured
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -95,11 +103,14 @@ ClPool3dKernel::ClPool3dKernel()
_type = CLKernelType::POOL;
}
-void ClPool3dKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+void ClPool3dKernel::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Auto init if empty
TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info);
@@ -112,23 +123,23 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
_num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4;
_num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
- const PoolingType pool_type = pool_info.pool_type;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
- const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
- const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
- const int pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
- const bool exclude_padding = pool_info.exclude_padding;
- const int pool_stride_x = pool_info.stride.x();
- const int pool_stride_y = pool_info.stride.y();
- const int pool_stride_z = pool_info.stride.z();
- const int pool_pad_top = pool_info.padding.top;
- const int pool_pad_left = pool_info.padding.left;
- const int pool_pad_front = pool_info.padding.front;
- const DataType data_type = src->data_type();
+ const PoolingType pool_type = pool_info.pool_type;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+ const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+ const bool exclude_padding = pool_info.exclude_padding;
+ const int pool_stride_x = pool_info.stride.x();
+ const int pool_stride_y = pool_info.stride.y();
+ const int pool_stride_z = pool_info.stride.z();
+ const int pool_pad_top = pool_info.padding.top;
+ const int pool_pad_left = pool_info.padding.left;
+ const int pool_pad_front = pool_info.padding.front;
+ const DataType data_type = src->data_type();
// Set build options
CLBuildOptions build_opts;
@@ -149,7 +160,7 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth)));
// If datatype is quantized, add relevant parameters
- if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
+ if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -161,9 +172,9 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
}
// Set the initial value for the pooling operation accordingly with the data type
- if(pool_type == PoolingType::MAX)
+ if (pool_type == PoolingType::MAX)
{
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
PixelValue type_min{};
std::tie(type_min, std::ignore) = get_min_max(data_type);
@@ -171,7 +182,8 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
}
else
{
- build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
+ build_opts.add_option("-DINITIAL_VALUE=" +
+ float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
}
}
else
@@ -181,16 +193,18 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
}
// Create kernel
// Floating point mixed precision is supported on F16 only
- const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+ const auto use_fp_mixed_precision =
+ (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
// Wider accumulation is required to avoid accuracy loss
// Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
DataType acc_data_type = data_type;
- if(use_fp_mixed_precision)
+ if (use_fp_mixed_precision)
{
acc_data_type = DataType::F32;
}
- else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
+ else if (is_data_type_quantized(data_type) &&
+ pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
{
acc_data_type = DataType::S32;
}
@@ -202,11 +216,13 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT
build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth)));
build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
// If datatype is quantized, use the quantized kernel function
- std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc");
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized"
+ : "pooling_3d_layer_MxN_ndhwc");
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
// Configure kernel window
Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
@@ -240,8 +256,9 @@ void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
// Collapse 3D window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h
index 00852349e6..6cd229c427 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.h
+++ b/src/gpu/cl/kernels/ClPool3dKernel.h
@@ -50,7 +50,10 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src.
* @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
*/
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClPool3dKernel::configure()
@@ -64,8 +67,8 @@ public:
private:
Pooling3dLayerInfo _pool_info{};
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- unsigned int _num_elems_processed_per_iteration{ 1 };
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ unsigned int _num_elems_processed_per_iteration{1};
};
} // namespace kernels
} // namespace opencl
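
Aside (not part of the patch): the 3D counterpart, again hedged, assuming validate() mirrors the configure() parameters shown above minus the compile context; the Pooling3dLayerInfo members set here (pool_type, pool_size, stride) are those ClPool3dKernel.cpp reads, and the shape is illustrative:

    // Sketch only: checking an NDHWC average-pooling configuration.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/gpu/cl/kernels/ClPool3dKernel.h"

    arm_compute::Status check_pool3d_config()
    {
        using namespace arm_compute;
        TensorInfo src(TensorShape(16U, 8U, 8U, 8U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NDHWC); // only NDHWC is supported (checked above)
        TensorInfo dst; // empty: shape is derived during validation

        Pooling3dLayerInfo pool_info;
        pool_info.pool_type = PoolingType::AVG;
        pool_info.pool_size = Size3D(2, 2, 2);
        pool_info.stride    = Size3D(2, 2, 2); // strides must be non-zero (checked above)

        return opencl::kernels::ClPool3dKernel::validate(&src, &dst, pool_info);
    }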
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
index 5c8bf97f0f..e8df420f67 100644
--- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -29,13 +29,12 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "support/Cast.h"
#include "support/StringSupport.h"
@@ -50,12 +49,14 @@ namespace
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
// Output must always be initialized
ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QASYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
return Status{};
@@ -71,7 +72,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
@@ -84,7 +85,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const
float scale_to_apply = qinfo.scale;
int32_t offset_to_apply = qinfo.offset;
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
/*
* In case of requantization of a quantized input tensor to an output tensor with another quantization
@@ -132,8 +133,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
- std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
+ build_opts.add_option_if(
+ multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+ std::pair<int, int> min_max_quant_values =
+ quantization::get_min_max_values_from_quantized_data_type(output_data_type);
build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
@@ -141,9 +144,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const
// Configure kernel window
Window win = calculate_max_window(*src, Steps());
- if(multi_access_x)
+ if (multi_access_x)
{
- win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ win.set(Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
}
ICLKernel::configure_internal(win);
@@ -173,8 +177,7 @@ void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Co
add_3D_tensor_argument(idx, src, slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
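
Aside (not part of the patch): the re-wrapped call above queries the representable range of the output type; a small hedged sketch of that helper in isolation, with the expected ranges noted as comments:

    // Sketch only: quantized min/max limits used when building -DMIN_QUANT_VAL / -DMAX_QUANT_VAL.
    #include <utility>
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    void quant_limits_example()
    {
        using namespace arm_compute;
        const std::pair<int, int> u8 = quantization::get_min_max_values_from_quantized_data_type(DataType::QASYMM8);
        const std::pair<int, int> s8 = quantization::get_min_max_values_from_quantized_data_type(DataType::QASYMM8_SIGNED);
        // Expected: u8 == {0, 255}, s8 == {-128, 127}.
        (void)u8;
        (void)s8;
    }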
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp
index 121bb33edf..53889f3a6b 100644
--- a/src/gpu/cl/kernels/ClReshapeKernel.cpp
+++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
@@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- if(dst->tensor_shape().total_size() != 0)
+ if (dst->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -72,27 +73,17 @@ void ClReshapeKernel::configure(const CLCompileContext &compile_context, const I
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Create kernel
- std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) };
+ std::set<std::string> build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())};
_kernel = create_kernel(compile_context, "reshape_layer", build_opts);
// Add static arguments
- const cl_int2 src_shape =
- {
- {
- static_cast<cl_int>(src->tensor_shape()[0]),
- static_cast<cl_int>(src->tensor_shape()[1])
- }
- };
- const cl_int2 dst_shape =
- {
- {
- static_cast<cl_int>(dst->tensor_shape()[0]),
- static_cast<cl_int>(dst->tensor_shape()[1])
- }
- };
+ const cl_int2 src_shape = {
+ {static_cast<cl_int>(src->tensor_shape()[0]), static_cast<cl_int>(src->tensor_shape()[1])}};
+ const cl_int2 dst_shape = {
+ {static_cast<cl_int>(dst->tensor_shape()[0]), static_cast<cl_int>(dst->tensor_shape()[1])}};
unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
_kernel.setArg<cl_int2>(idx++, src_shape);
_kernel.setArg<cl_int2>(idx++, dst_shape);
@@ -119,8 +110,9 @@ void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
Window slice = window_collapsed.first_slice_window_3D();
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
// Set srcs
unsigned int idx = 0;
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h
index db6ab5da58..95eae82086 100644
--- a/src/gpu/cl/kernels/ClReshapeKernel.h
+++ b/src/gpu/cl/kernels/ClReshapeKernel.h
@@ -58,7 +58,7 @@ public:
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
};
-} // namespace opencl
} // namespace kernels
+} // namespace opencl
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index 4c4373a215..4305acad26 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,8 @@ namespace kernels
{
namespace
{
-inline std::tuple<float, float> calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
+inline std::tuple<float, float>
+calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -64,20 +66,25 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1);
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ info.align_corners &&
+ !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) &&
+ !is_data_type_quantized_asymmetric(src->data_type()));
float scale_x = 0.f;
float scale_y = 0.f;
const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
+ std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
- ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f));
+ ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA &&
+ (scale_x > 1.f || scale_y > 1.f));
return Status{};
}
@@ -94,23 +101,26 @@ ClScaleKernel::ClScaleKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+void ClScaleKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Info required for the static tuning
_data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
const bool is_nhwc = _data_layout == DataLayout::NHWC;
- float scale_x = 0.f;
- float scale_y = 0.f;
+ float scale_x = 0.f;
+ float scale_y = 0.f;
std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
auto interpolation_policy_to_use = info.interpolation_policy;
- if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
+ if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
{
interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
}
@@ -127,7 +137,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
unsigned int vec_size_leftover = 0;
CLBuildOptions build_opts;
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
vec_size_leftover = dst_channels % vec_size;
@@ -135,7 +145,8 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+ build_opts.add_option("-DCONSTANT_VALUE=" +
+ string_from_pixel_value(info.constant_border_value, src->data_type()));
build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
@@ -144,27 +155,33 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
- build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+ build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+ "-DSAMPLING_POLICY_TOP_LEFT");
}
- else if(_data_layout == DataLayout::NCHW)
+ else if (_data_layout == DataLayout::NCHW)
{
vec_size = adjust_vec_size(4, dst_width);
vec_size_leftover = dst_width % vec_size;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+ build_opts.add_option("-DCONSTANT_VALUE=" +
+ string_from_pixel_value(info.constant_border_value, src->data_type()));
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0)
+ ? support::cpp11::to_string(vec_size)
+ : support::cpp11::to_string(vec_size_leftover)));
build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
- build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+ build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+ "-DSAMPLING_POLICY_TOP_LEFT");
- const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
- if(is_qasymm_bilinear)
+ const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) &&
+ info.interpolation_policy == InterpolationPolicy::BILINEAR;
+ if (is_qasymm_bilinear)
{
const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
@@ -190,7 +207,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
// Pass scale kernel arguments
- if(is_nhwc)
+ if (is_nhwc)
{
unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
_kernel.setArg<cl_float>(idx++, scale_x);
@@ -219,7 +236,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma
auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
{
@@ -231,8 +248,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma
add_2D_tensor_argument(idx, src, slice);
add_2D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
break;
}
case DataLayout::NHWC:
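The one piece of logic in ClScaleKernel worth restating on its own is the scale-factor policy: validate_arguments() rejects AREA interpolation when either scale factor exceeds 1, and configure() demotes AREA to nearest-neighbour when both factors are at or below 1 (the up-sampling case). A minimal standalone sketch of that decision, taking the scale factors as plain floats in whatever convention calculate_scale_factors() uses; the enum and struct below are illustrative, not the library's types:

enum class Interp { AREA, BILINEAR, NEAREST_NEIGHBOR };

struct PolicySelection
{
    bool   valid;  // stands in for the Status returned by validate_arguments()
    Interp policy; // policy the kernel actually compiles for
};

PolicySelection select_policy(Interp requested, float scale_x, float scale_y)
{
    // Mirrors validate_arguments(): AREA with a scale factor above 1 is rejected.
    if (requested == Interp::AREA && (scale_x > 1.f || scale_y > 1.f))
    {
        return {false, requested};
    }
    // Mirrors configure(): AREA behaves as nearest neighbour when up-sampling.
    if (requested == Interp::AREA && scale_x <= 1.f && scale_y <= 1.f)
    {
        return {true, Interp::NEAREST_NEIGHBOR};
    }
    return {true, requested};
}

The kernel then bakes the chosen policy into the build options as -DSCALE_<POLICY> rather than returning it.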
diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h
index dd09e92ee2..c09659017d 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.h
+++ b/src/gpu/cl/kernels/ClScaleKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_SCALE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -49,7 +50,8 @@ public:
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClScaleKernel::configure()
@@ -62,7 +64,7 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- DataLayout _data_layout{ DataLayout::UNKNOWN };
+ DataLayout _data_layout{DataLayout::UNKNOWN};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
index 59299fa441..1b5a2666bc 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -60,15 +62,16 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float
// Number of integer bits used in temporary fixed-point representation of exponent accumulator
static const int exp_accumulation_in_bits = 12;
- const double beta_multiplier = std::min(
- 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
- (1LL << 31) - 1.0);
+ const double beta_multiplier =
+ std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0);
int input_beta_multiplier;
int input_beta_left_shift;
- quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
+ quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier,
+ &input_beta_left_shift);
- const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
- const int diff_min = -1.f * std::floor(max_input_rescaled);
+ const double max_input_rescaled =
+ 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
+ const int diff_min = -1.f * std::floor(max_input_rescaled);
CLBuildOptions build_opts;
build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
@@ -80,18 +83,22 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float
return build_opts;
}
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src,
+ const ITensorInfo &max,
+ const ITensorInfo &dst,
+ const ITensorInfo &sum)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
// Checks performed when output is configured
- if(dst.total_size() != 0)
+ if (dst.total_size() != 0)
{
- if(is_quantized_asymmetric)
+ if (is_quantized_asymmetric)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32);
}
@@ -103,9 +110,9 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor
}
// Checks performed when sum is configured
- if(sum.total_size() != 0)
+ if (sum.total_size() != 0)
{
- if(is_quantized_asymmetric)
+ if (is_quantized_asymmetric)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32);
}
@@ -119,7 +126,10 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor
return Status{};
}
-Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+Status validate_arguments_1DNorm(const ITensorInfo &src,
+ const ITensorInfo &sum,
+ const ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32);
@@ -127,14 +137,15 @@ Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum,
ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
// Note: output should always have a scale of 1/256 and offset 0
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
+ const QuantizationInfo allowed_quantization_info =
+ get_softmax_output_quantization_info(info.input_data_type, info.is_log);
+ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
// Checks performed when output is configured
- if(dst.total_size() != 0)
+ if (dst.total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- if(!is_quantized_asymmetric)
+ if (!is_quantized_asymmetric)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
}
@@ -161,9 +172,14 @@ ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info)
+void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &max,
+ ITensorInfo &dst,
+ ITensorInfo &sum,
+ const SoftmaxKernelInfo &info)
{
- auto padding_info = get_padding_info({ &src, &max, &dst, &sum });
+ auto padding_info = get_padding_info({&src, &max, &dst, &sum});
// Output auto initialization if not yet initialized
auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape()));
@@ -191,15 +207,21 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f),
+ "-DBETA=" + float_to_string_with_full_precision(beta));
build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
- build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
+ build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX")
+ : std::string("-FLT_MAX")));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+ "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+ build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+ "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_options_if(is_data_type_quantized_asymmetric(dt),
+ prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
cl::NDRange lws_hint(cl::NullRange);
- std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial";
+ std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") +
+ (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial";
// Create kernel.
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -211,7 +233,10 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
+Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src,
+ const ITensorInfo &max,
+ const ITensorInfo &dst,
+ const ITensorInfo &sum)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
return Status{};
@@ -241,7 +266,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &
// Reconfigure window in case of parallel reduction
ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0));
- if(std::get<0>(parallel_reduction_info))
+ if (std::get<0>(parallel_reduction_info))
{
// Launch grid_size parallel work items
window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
@@ -258,8 +283,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &
add_3D_tensor_argument(idx, dst, slice);
add_3D_tensor_argument(idx, sum, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
ClLogits1DNormKernel::ClLogits1DNormKernel()
@@ -267,18 +291,24 @@ ClLogits1DNormKernel::ClLogits1DNormKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info)
+void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ const ITensorInfo &sum,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
{
- auto padding_info = get_padding_info({ &src, &dst, &sum });
+ auto padding_info = get_padding_info({&src, &dst, &sum});
// Note: output should always have a scale of 1/256 and offset 0
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
+ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
+ const DataType output_data_type = info.input_data_type;
+ const QuantizationInfo allowed_quantization_info =
+ get_softmax_output_quantization_info(info.input_data_type, info.is_log);
+ const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
// Output auto initialization if not yet initialized
- auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
+ auto_init_if_empty(dst,
+ src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info));
@@ -311,7 +341,10 @@ void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, co
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+Status ClLogits1DNormKernel::validate(const ITensorInfo &src,
+ const ITensorInfo &sum,
+ const ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info));
@@ -343,9 +376,8 @@ void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::
add_3D_tensor_argument(idx, sum, sum_slice);
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
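The quantized path in prepare_quantized_softmax_build_options() is easier to follow with the fixed-point arithmetic pulled out of the build-option plumbing. A minimal sketch of the two derived quantities, reproducing the arithmetic of the hunk above, with scaled_diff_int_bits and the left shift produced by the library's quantized-multiplier helper passed in as plain parameters (the function names below are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

// beta * input_scale promoted into fixed point with (31 - scaled_diff_int_bits)
// fractional bits, saturated just below 2^31 so it stays representable as a
// signed 32-bit quantized multiplier.
double softmax_beta_multiplier(float beta, float input_scale, int scaled_diff_int_bits)
{
    return std::min(1.0 * beta * input_scale * (1LL << (31 - scaled_diff_int_bits)),
                    (1LL << 31) - 1.0);
}

// Most negative rescaled (input - max) difference that the representation can
// hold, given the integer range and the left shift chosen for the multiplier.
int32_t softmax_diff_min(int scaled_diff_int_bits, int input_beta_left_shift)
{
    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) *
                                      (1LL << (31 - scaled_diff_int_bits)) /
                                      (1LL << input_beta_left_shift);
    return static_cast<int32_t>(-std::floor(max_input_rescaled));
}

The saturation against 2^31 - 1 is what allows calculate_quantized_multiplier_greater_than_one() to turn the value into an integer multiplier plus shift.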
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h
index a221e12132..2dd53da346 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.h
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -61,14 +62,20 @@ public:
* @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src
* @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &max,
+ ITensorInfo &dst,
+ ITensorInfo &sum,
+ const SoftmaxKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum);
+ static Status
+ validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum);
/** Checks if the given size is eligible for parallel reduction
*
* @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
@@ -100,14 +107,19 @@ public:
* @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
* @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ const ITensorInfo &sum,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClLogits1DNormKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+ static Status
+ validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp
index 6450ffb5b2..6eb2bf81c0 100644
--- a/src/gpu/cl/kernels/ClTransposeKernel.cpp
+++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp
@@ -29,9 +29,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -58,12 +59,12 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Create kernel
- const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
+ const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
- const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1));
+ const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1));
const int vec_size_y_leftovers = src->dimension(1) % vec_size_y;
CLBuildOptions build_opts;
@@ -89,9 +90,10 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported");
// Validate configured dst
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
- const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
+ const TensorInfo dst_info =
+ src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -106,8 +108,9 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window slice = window.first_slice_window_2D();
@@ -117,9 +120,8 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C
add_2D_tensor_argument(idx, src, slice);
add_2D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_2D(slice));
+ } while (window.slide_window_slice_2D(slice));
}
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
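ClTransposeKernel::validate() above accepts at most 2-D sources and, when the destination is already configured, requires its shape to match what compute_transposed_shape() produces, i.e. the source shape with the first two dimensions swapped. A tiny restatement of that shape check over plain vectors (illustrative types, not the library's TensorShape):

#include <cstddef>
#include <vector>

// Expected destination shape of a transpose: dimensions 0 and 1 swapped,
// treating a missing dimension as 1.
bool transposed_shape_matches(const std::vector<std::size_t> &src_shape,
                              const std::vector<std::size_t> &dst_shape)
{
    if (src_shape.size() > 2 || dst_shape.size() > 2)
    {
        return false; // only up to 2-D tensors are supported
    }
    const std::size_t src_w = src_shape.empty() ? 1 : src_shape[0];
    const std::size_t src_h = src_shape.size() > 1 ? src_shape[1] : 1;
    const std::size_t dst_w = dst_shape.empty() ? 1 : dst_shape[0];
    const std::size_t dst_h = dst_shape.size() > 1 ? dst_shape[1] : 1;
    return dst_w == src_h && dst_h == src_w;
}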
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
index ae825694c5..76f39ac500 100644
--- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
@@ -26,14 +26,14 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-
namespace arm_compute
{
namespace opencl
@@ -42,11 +42,15 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+ DataType::QASYMM8_SIGNED, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC);
@@ -56,12 +60,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
constexpr unsigned int height_idx = 2;
constexpr unsigned int batch_idx = 3;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -77,15 +82,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const size_t input_width = input->dimension(width_idx);
const size_t input_height = input->dimension(height_idx);
const size_t weights_width = weights->dimension(width_idx);
const size_t weights_height = weights->dimension(height_idx);
- auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
- TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ auto out_dims =
+ deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -96,8 +103,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
} // namespace
-void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info)
+void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_UNUSED(biases, deconv_info);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -119,7 +130,8 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co
const size_t output_channels = output->dimension(channel_idx);
// Calculate output shape
- auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+ auto out_dims =
+ deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info());
@@ -147,7 +159,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co
const DataType input_data_type = input->data_type();
const PaddingInfo strides = deconv_info.stride();
- if(biases != nullptr)
+ if (biases != nullptr)
{
build_options.add_option(std::string("-DHAS_BIAS"));
build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
@@ -180,7 +192,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co
build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP");
- if(is_data_type_quantized(output_data_type))
+ if (is_data_type_quantized(output_data_type))
{
const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -210,7 +222,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co
build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
}
- if(compile_context.get_ddk_version() >= 30)
+ if (compile_context.get_ddk_version() >= 30)
{
build_options.add_option("-fregister-allocation=64");
}
@@ -235,8 +247,11 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co
_config_id += support::cpp11::to_string(n0);
}
-Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases,
- const ITensorInfo *dst, const PadStrideInfo &deconv_info)
+Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info));
return Status{};
@@ -250,17 +265,20 @@ void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, const Window &w
// Get initial windows
Window slice = window.first_slice_window_3D();
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
- const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ const auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ const auto biases =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
unsigned int idx = 0;
add_4d_tensor_nhwc_argument(idx, src);
add_4d_tensor_nhwc_argument(idx, dst);
add_4d_tensor_nhwc_argument(idx, weights);
- if(biases != nullptr)
+ if (biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
}
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
index d4350dda50..44f6f56b7a 100644
--- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
@@ -45,16 +45,23 @@ public:
* Similar to @ref ClTransposedConvolution::configure()
*
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClTransposedConvolution::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
- const ITensorInfo *output, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -63,4 +70,4 @@ public:
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
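Both configure() and validate() above derive the expected spatial output extent from deconvolution_output_dimensions() before comparing shapes. For reference, the textbook transposed-convolution relation is sketched below; the library helper is assumed to follow it, though any padding corner cases it handles are not visible in this diff:

#include <utility>

// Textbook transposed-convolution output size: inverts the forward relation
// out = (in + pad_total - kernel) / stride + 1.
std::pair<unsigned int, unsigned int> transposed_conv_output_dims(
    unsigned int in_w, unsigned int in_h,
    unsigned int kernel_w, unsigned int kernel_h,
    unsigned int stride_x, unsigned int stride_y,
    unsigned int pad_total_x, unsigned int pad_total_y)
{
    const unsigned int out_w = (in_w - 1) * stride_x + kernel_w - pad_total_x;
    const unsigned int out_h = (in_h - 1) * stride_y + kernel_h - pad_total_y;
    return {out_w, out_h};
}

compute_deconvolution_output_shape() then folds these spatial extents together with the channel and batch dimensions into the full output shape that the checks compare against.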
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
index 8f36345076..af80c4d796 100644
--- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
@@ -39,7 +41,10 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -48,20 +53,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) &&
+ (biases->dimension(0) != input->tensor_shape()[3]));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (input->num_dimensions() == 5) &&
+ (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
}
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -75,16 +84,21 @@ ClWeightsReshapeKernel::ClWeightsReshapeKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups)
+void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups));
- auto padding_info = get_padding_info({ src, biases, dst });
+ auto padding_info = get_padding_info({src, biases, dst});
const DataType data_type = src->data_type();
@@ -104,7 +118,10 @@ void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context,
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups)
+Status ClWeightsReshapeKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups));
return Status{};
@@ -136,7 +153,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window,
_kernel.setArg<cl_uint>(idx++, src->info()->dimension(3));
_kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z());
- if(biases != nullptr)
+ if (biases != nullptr)
{
biases_window.use_tensor_dimensions(biases->info()->tensor_shape());
biases_slice = biases_window.first_slice_window_1D();
@@ -148,7 +165,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window,
unsigned idx = 0;
add_3D_tensor_argument(idx, src, in_slice);
add_2D_tensor_argument(idx, dst, out_slice);
- if(biases != nullptr)
+ if (biases != nullptr)
{
add_1D_tensor_argument(idx, biases, biases_slice);
ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
@@ -156,8 +173,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window,
// Run kernel
enqueue(queue, *this, in_slice, lws_hint());
- }
- while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+ } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
index 7364eb97ae..5e05f8d006 100644
--- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
@@ -75,14 +75,19 @@ public:
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
* A number of groups greater than one is only supported for NCHW data layout, and the number of weights must be a multiple of it.
*/
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1);
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClWeightsReshapeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
+ static Status
+ validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -90,4 +95,4 @@ public:
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
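The bias checks in validate_arguments() earlier in this file's .cpp diff encode a small shape contract: a 4-D weights tensor takes a 1-D bias whose length equals weights dimension 3, and a 5-D weights tensor takes a 2-D bias matching dimensions 3 and 4. A compact restatement over plain shape vectors (illustrative, not the library's TensorShape):

#include <cstddef>
#include <vector>

// weights_shape holds the dimensions in the same order as the checks above:
// dimension 3 is compared against the bias length, and dimension 4 (5-D case
// only) against the bias's second dimension.
bool bias_shape_is_valid(const std::vector<std::size_t> &weights_shape,
                         const std::vector<std::size_t> &bias_shape)
{
    if (weights_shape.size() == 4)
    {
        return bias_shape.size() == 1 && bias_shape[0] == weights_shape[3];
    }
    if (weights_shape.size() == 5)
    {
        return bias_shape.size() == 2 && bias_shape[0] == weights_shape[3] &&
               bias_shape[1] == weights_shape[4];
    }
    return false;
}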
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
index 0a9a3f021f..15195025ce 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
@@ -29,11 +29,11 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/tensor_info.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0));
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
@@ -63,7 +63,8 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
}
} // namespace
-Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+Status
+ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst));
return Status{};
@@ -74,12 +75,15 @@ ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel()
_type = CLKernelType::ELEMENTWISE;
}
-void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst));
- auto padding_info = get_padding_info({ src1, src2, dst });
+ auto padding_info = get_padding_info({src1, src2, dst});
const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0));
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
@@ -91,11 +95,12 @@ void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
- build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
// If inputs have different quantization info, set the quantization parameters needed for the re-quantization process
const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
- if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+ if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
{
const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
@@ -146,9 +151,11 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window
Window slice = window.first_slice_window_4D();
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
do
{
@@ -159,8 +166,7 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_int>(idx++, _depth);
_kernel.setArg<cl_int>(idx++, _input1_width);
enqueue(queue, *this, window, lws_hint());
- }
- while(window.slide_window_slice_4D(slice));
+ } while (window.slide_window_slice_4D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
index 5c54479002..8b53d6d66b 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
@@ -62,8 +62,8 @@ public:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
private:
- int32_t _depth{ 0 };
- int32_t _input1_width{ 0 };
+ int32_t _depth{0};
+ int32_t _input1_width{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
index 54f7ad344a..c4f84e3e45 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -30,11 +30,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/tensor_info.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -45,15 +45,20 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
+Status validate_arguments(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *src3,
+ const ITensorInfo *src4,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) >
+ dst->dimension(0));
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
@@ -71,22 +76,29 @@ ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel()
_type = CLKernelType::ELEMENTWISE;
}
-Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
+Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *src3,
+ const ITensorInfo *src4,
+ const ITensorInfo *dst)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst));
return Status{};
}
void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
- ITensorInfo *src1, ITensorInfo *src2,
- ITensorInfo *src3, ITensorInfo *src4,
- ITensorInfo *dst)
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *src3,
+ ITensorInfo *src4,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst));
- auto padding_info = get_padding_info({ src1, src2, src3, src4, dst });
- const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
+ auto padding_info = get_padding_info({src1, src2, src3, src4, dst});
+ const unsigned int min_dimension =
+ std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration;
@@ -96,9 +108,14 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
- build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
- build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
- build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT2_ROTATE_N=" +
+ support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
+ build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) +
+ src3->dimension(0) - vec_size_leftover) %
+ num_elems_processed_per_iteration));
_depth = src1->dimension(2);
_input1_width = src1->dimension(0);
@@ -106,8 +123,9 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile
_input3_width = src3->dimension(0);
// If sources have different quantization info, set the quantization parameters needed for the re-quantization process
- const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
- if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+ const bool have_different_qinfo =
+ helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
+ if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
{
const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
@@ -166,11 +184,15 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
- const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
- const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
- const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src0 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+ const auto src1 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+ const auto src2 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
+ const auto src3 =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Window slice = window.first_slice_window_4D();
@@ -187,8 +209,7 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window
_kernel.setArg<cl_int>(idx++, _input2_width);
_kernel.setArg<cl_int>(idx++, _input3_width);
enqueue(queue, *this, window, lws_hint());
- }
- while(window.slide_window_slice_4D(slice));
+ } while (window.slide_window_slice_4D(slice));
}
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
index baf8d381be..f589b8ac1a 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
@@ -52,23 +52,32 @@ public:
* @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1
* @param[out] dst Destination tensor info. Data types supported: same as @p src1.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *src3,
+ ITensorInfo *src4,
+ ITensorInfo *dst);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClWidthConcatenate4TensorsKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst);
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *src3,
+ const ITensorInfo *src4,
+ const ITensorInfo *dst);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
private:
- int32_t _depth{ 0 };
- int32_t _input1_width{ 0 };
- int32_t _input2_width{ 0 };
- int32_t _input3_width{ 0 };
+ int32_t _depth{0};
+ int32_t _input1_width{0};
+ int32_t _input2_width{0};
+ int32_t _input3_width{0};
};
} // namespace kernels
} // namespace opencl
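The INPUTn_ROTATE_N build options set up by the two width-concatenation kernels above all follow one pattern: the cumulative width of the preceding sources, shifted by the destination's leftover lane count, taken modulo the vector size. A small sketch of that bookkeeping for the four-tensor case, using plain signed integers (the real code works on unsigned tensor dimensions, and the struct below is illustrative):

#include <cstdint>

struct ConcatRotateOffsets
{
    int32_t input1;
    int32_t input2;
    int32_t input3;
};

// vec_size is the number of elements processed per iteration and leftover the
// destination width modulo vec_size, as computed in configure() above.
ConcatRotateOffsets rotate_offsets(int32_t w1, int32_t w2, int32_t w3,
                                   int32_t vec_size, int32_t leftover)
{
    ConcatRotateOffsets r{};
    r.input1 = (w1 - leftover) % vec_size;           // lane offset after source 1
    r.input2 = (w1 + w2 - leftover) % vec_size;      // lane offset after source 2
    r.input3 = (w1 + w2 + w3 - leftover) % vec_size; // lane offset after source 3
    return r;
}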
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
index 2dfe7fce52..989de4a7b7 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
namespace arm_compute
@@ -53,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
}
@@ -74,12 +74,15 @@ Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int w
return Status{};
}
-void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
+void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ unsigned int width_offset,
+ ITensorInfo *dst)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0));
@@ -87,10 +90,11 @@ void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+ support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset));
- if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+ if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
{
const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
@@ -121,8 +125,9 @@ void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
unsigned int idx = 0;
add_4D_tensor_argument(idx, src, window);
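
The configure() hunk above sizes the OpenCL vector width with adjust_vec_size(16, width) and passes the remainder to the kernel as -DVEC_SIZE_LEFTOVER, presumably so the final partial vector can be handled separately. A small worked example follows; the stand-in adjust_vec_size_standin assumes the real helper simply caps the preferred width at the tensor width, which is an assumption, not something stated in the patch:

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for adjust_vec_size(): cap the preferred vector width
// at the actual tensor width (assumed behaviour of the real helper).
static unsigned int adjust_vec_size_standin(unsigned int preferred, unsigned int width)
{
    return std::min(preferred, width);
}

int main()
{
    const unsigned int width    = 20;                                  // src->dimension(0)
    const unsigned int vec_size = adjust_vec_size_standin(16U, width); // -> 16, passed as -DVEC_SIZE
    const unsigned int leftover = width % vec_size;                    // -> 4,  passed as -DVEC_SIZE_LEFTOVER
    std::printf("-DVEC_SIZE=%u -DVEC_SIZE_LEFTOVER=%u\n", vec_size, leftover);
}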
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
index 3ace4400e6..c10d6a4dc6 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
@@ -50,7 +50,8 @@ public:
* @param[in,out] dst Destination tensor info. Data types supported: same as @p src.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClWidthConcatenateKernel::configure()
@@ -63,7 +64,7 @@ public:
void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
private:
- int32_t _depth{ 0 };
+ int32_t _depth{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
index 7148a4c85c..58c01d4da5 100644
--- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
@@ -29,10 +29,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -60,14 +61,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported");
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+ "Winograd filter transform not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width ||
+ input->dimension(idx_h) != kernel_size.height);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -81,11 +86,15 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(output);
- const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
+ const unsigned int num_elems_processed_per_iteration_x =
+ input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
- const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
+ const unsigned int num_elems_read_per_iteration_z =
+ input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
+ Window win =
+ calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y,
+ num_elems_read_per_iteration_z));
Window win_collapsed = win.collapse(win, Window::DimZ);
return std::make_pair(Status{}, win_collapsed);
}
@@ -96,21 +105,25 @@ ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel()
_type = CLKernelType::WINOGRAD;
}
-void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
+void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
+ auto_init_if_empty(*dst,
+ src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
// Set build options
CLBuildOptions build_opts;
// For NHWC layouts pass tensor dimensions at runtime
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
_src_dim_z = src->dimension(2);
}
@@ -125,7 +138,8 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_
const Size2D output_tile_size = winograd_info.output_tile_size;
// Create kernel
- std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
+ std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" +
+ kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -138,7 +152,9 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
-Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info)
+Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
@@ -161,7 +177,7 @@ void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window
unsigned int idx = 0;
add_4D_tensor_argument(idx, src, window);
add_3D_tensor_argument(idx, dst, window_out);
- if(src->info()->data_layout() == DataLayout::NHWC)
+ if (src->info()->data_layout() == DataLayout::NHWC)
{
_kernel.setArg<cl_uint>(idx++, _src_dim_z);
}
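
The configure() hunk above builds the kernel name from the output tile size, the kernel size and the data layout, then uppercases it into a -D macro guard so that only that kernel variant is compiled. A short sketch of that string construction, assuming Size2D::to_string() yields "WxH" (an assumption about that helper, not taken from the patch):

#include <cctype>
#include <cstdio>
#include <string>

int main()
{
    const std::string output_tile = "4x4";  // output_tile_size.to_string(), assumed format
    const std::string kernel      = "3x3";  // kernel_size.to_string(), assumed format
    const std::string layout      = "nhwc"; // lower_string(string_from_data_layout(...))

    const std::string kernel_name = "winograd_filter_transform_" + output_tile + "_" + kernel + "_" + layout;

    std::string guard = kernel_name;        // plays the role of upper_string(kernel_name)
    for (char &c : guard)
    {
        c = static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
    }

    std::printf("kernel : %s\n", kernel_name.c_str()); // winograd_filter_transform_4x4_3x3_nhwc
    std::printf("option : -D%s\n", guard.c_str());     // -DWINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC
}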
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
index b2130304e6..6e439f0c99 100644
--- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -59,7 +60,10 @@ public:
* @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
* @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClWinogradFilterTransformKernel::configure()
@@ -72,7 +76,7 @@ public:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
- int32_t _src_dim_z{ 0 };
+ int32_t _src_dim_z{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
index fab6c36032..54c48986fc 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -55,17 +56,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
const Size2D kernel_size = winograd_info.kernel_size;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+ "Winograd input transform only supports unit strides");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+ "Winograd input transform not supported");
ARM_COMPUTE_UNUSED(conv_info);
ARM_COMPUTE_UNUSED(output_tile_size);
ARM_COMPUTE_UNUSED(kernel_size);
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -74,7 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
{
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -82,7 +88,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
bool window_changed = false;
int num_elems_processed_per_iteration = 1;
- if(input->data_layout() == DataLayout::NHWC)
+ if (input->data_layout() == DataLayout::NHWC)
{
// In the case of FP16 computation, we can perform more
// output feature maps in a single work-item.
@@ -94,9 +100,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const size_t dim0 = input->dimension(0);
const size_t k_sz = winograd_info.kernel_size.area();
const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
- if(cond)
+ if (cond)
{
- if(k_sz == 3 || k_sz == 9)
+ if (k_sz == 3 || k_sz == 9)
{
num_elems_processed_per_iteration = 2;
}
@@ -104,7 +110,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- if(input->data_layout() == DataLayout::NCHW)
+ if (input->data_layout() == DataLayout::NCHW)
{
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
@@ -113,11 +119,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
- AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+ AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
+ num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
window_changed = update_window_and_padding(win, input_access);
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -132,12 +140,15 @@ BorderSize ClWinogradInputTransformKernel::border_size() const
return _border_size;
}
-void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
+void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
- auto padding_info = get_padding_info({ src, dst });
+ auto padding_info = get_padding_info({src, dst});
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
@@ -150,14 +161,13 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
// Compute the number of output tiles along the x and y direction of size "output_tile_size"
const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)),
- kernel_size,
- output_tile_size,
- conv_info);
+ kernel_size, output_tile_size, conv_info);
_num_tiles_x = num_tiles.width;
_num_tiles_y = num_tiles.height;
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+ const TensorShape output_shape =
+ misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape));
@@ -174,7 +184,7 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
_src_height = src->dimension(idx_h);
CLBuildOptions build_opts;
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
build_opts.add_option("-DNHWC");
build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
@@ -201,13 +211,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
}
// Create kernel
- std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
+ std::string kernel_name =
+ "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
// Get the maximum dimension from the tile size
const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
// Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
+ if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
{
_step_z = (src->dimension(2) % 2) != 0 ? 1 : 2;
}
@@ -239,11 +250,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
_config_id += lower_string(string_from_data_layout(_data_layout));
}
-Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info)
+Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
return Status{};
}
@@ -263,7 +277,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &
// Collapse window
Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
- if(_data_layout == DataLayout::NHWC)
+ if (_data_layout == DataLayout::NHWC)
{
Window slice = window_collapsed.first_slice_window_3D();
slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1));
@@ -298,8 +312,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &
add_3D_tensor_argument(idx, dst, slice);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window_collapsed.slide_window_slice_3D(slice));
+ } while (window_collapsed.slide_window_slice_3D(slice));
}
}
} // namespace kernels
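
The NHWC branch of validate_and_configure_window() above only widens the work-item to two output elements when the tensor is FP16, its leading dimension is even, and the Winograd kernel area is 3 or 9 (e.g. 3x1/1x3 or 3x3). The sketch below reproduces just that decision with stand-in names (the enum and function are hypothetical):

#include <cstdio>

enum class DataType { F16, F32 };

// Sketch of the NHWC vectorization choice: two output feature maps per
// work-item only for FP16, even channel count, and kernel area 3 or 9.
static int elems_per_iteration(DataType dt, unsigned int dim0, unsigned int kernel_area)
{
    int n = 1;
    if (dt == DataType::F16 && (dim0 % 2) == 0 && (kernel_area == 3 || kernel_area == 9))
    {
        n = 2;
    }
    return n;
}

int main()
{
    std::printf("%d\n", elems_per_iteration(DataType::F16, 64, 9)); // 2
    std::printf("%d\n", elems_per_iteration(DataType::F32, 64, 9)); // 1
    std::printf("%d\n", elems_per_iteration(DataType::F16, 63, 9)); // 1
}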
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
index c10c528b9b..cebebea1d3 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -59,7 +60,10 @@ public:
* @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
* @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClWinogradInputTransformKernel::configure()
@@ -69,19 +73,19 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
BorderSize border_size() const override;
private:
using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
- BorderSize _border_size{ 0 };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- int _num_tiles_x{ 0 };
- int _num_tiles_y{ 0 };
- unsigned int _step_z{ 1 };
- int32_t _src_width{ 0 };
- int32_t _src_height{ 0 };
+ BorderSize _border_size{0};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ int _num_tiles_x{0};
+ int _num_tiles_y{0};
+ unsigned int _step_z{1};
+ int32_t _src_width{0};
+ int32_t _src_height{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index bf974d30d8..89c80c55ef 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -31,10 +30,12 @@
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -54,7 +55,11 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
@@ -66,30 +71,32 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
const Size2D output_tile_size = winograd_info.output_tile_size;
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D input_dimensions = winograd_info.input_dimensions;
- const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
+ const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) *
+ (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout),
+ "Winograd output transform not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
// Compute number of elements to process in the X and Y direction
// Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
+ const Size2D num_tiles =
+ compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
}
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -98,14 +105,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *bias,
+ ITensorInfo *output,
+ const Size2D &output_tile_size)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(bias);
unsigned int num_elems_processed_per_iteration = 1;
- if(input->data_layout() == DataLayout::NHWC)
+ if (input->data_layout() == DataLayout::NHWC)
{
// In the case of FP16 computation, we can perform more
// output feature maps in a single work-item.
@@ -115,7 +125,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const DataType dt = input->data_type();
const size_t dim0 = input->dimension(0);
const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
- if(cond)
+ if (cond)
{
num_elems_processed_per_iteration = 2;
}
@@ -124,17 +134,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
bool window_changed = false;
- if(output->data_layout() == DataLayout::NCHW)
+ if (output->data_layout() == DataLayout::NCHW)
{
const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration,
+ num_elems_processed_per_iteration);
AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
window_changed = update_window_and_padding(win, input_access, output_access);
}
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ Status err =
+ (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
} // namespace
@@ -144,13 +156,18 @@ ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel()
_type = CLKernelType::WINOGRAD;
}
-void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
+void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
+ auto_init_if_empty(*dst,
+ src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info));
@@ -159,7 +176,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
IClKernel::configure_internal(win_config.second);
- auto padding_info = get_padding_info({ src, bias, dst });
+ auto padding_info = get_padding_info({src, bias, dst});
_is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
@@ -168,14 +185,13 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
const PadStrideInfo conv_info = winograd_info.convolution_info;
- const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height =
+ get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
// Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
- kernel_size,
- output_tile_size,
- conv_info);
+ const Size2D num_tiles =
+ compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
const size_t total_batches = dst->tensor_shape().total_size_upper(3);
// Set build options
@@ -184,11 +200,11 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
- if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
+ if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
{
build_opts.add_option("-DVEC_SIZE=2");
}
- else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
+ else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
{
build_opts.add_option("-DVEC_SIZE=4");
}
@@ -200,9 +216,10 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
const auto act_function = act_info.activation();
const auto src_data_type = src->data_type();
- if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
- && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- && (src_data_type == DataType::F32 || src_data_type == DataType::F16))
+ if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+ act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+ (src_data_type == DataType::F32 || src_data_type == DataType::F16))
{
// -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
// to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -213,7 +230,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
build_opts.add_option("-cl-fast-relaxed-math");
}
- if(_is_nhwc)
+ if (_is_nhwc)
{
build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
@@ -247,7 +264,9 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
_dst_height = dst->dimension(idx_height);
// Create kernel
- std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
+ std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" +
+ kernel_size.to_string() + "_" +
+ lower_string(string_from_data_layout(winograd_info.output_data_layout));
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));
@@ -271,10 +290,18 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
}
-Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
+ (bias != nullptr ? bias->clone().get() : nullptr),
+ dst->clone().get(), winograd_info.output_tile_size)
+ .first);
return Status{};
}
@@ -299,7 +326,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window
slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- if(bias != nullptr)
+ if (bias != nullptr)
{
unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
Window slice_biases;
@@ -307,7 +334,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window
add_1D_tensor_argument(idx1, bias, slice_biases);
}
- if(_is_nhwc)
+ if (_is_nhwc)
{
unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
_kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y()));
@@ -322,8 +349,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window
add_4D_tensor_argument(idx, src, slice);
add_4D_tensor_argument(idx, dst, slice_out);
enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
}
} // namespace kernels
} // namespace opencl
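
The channel check in validate_arguments() above expects the input of the output transform to carry one channel per element of the transformed tile, i.e. (kernel_w + output_tile_w - 1) * (kernel_h + output_tile_h - 1) channels. A short worked example for F(4x4, 3x3):

#include <cstdio>

int main()
{
    const unsigned int kernel_w = 3, kernel_h = 3; // 3x3 convolution
    const unsigned int tile_w   = 4, tile_h   = 4; // 4x4 output tile
    const unsigned int num_channels = (kernel_w + tile_w - 1) * (kernel_h + tile_h - 1);
    std::printf("expected src->dimension(2) = %u\n", num_channels); // 6 * 6 = 36
}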
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
index 6f018967d0..65bb963061 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -61,7 +62,11 @@ public:
* @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -69,7 +74,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const WinogradInfo &winograd_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -77,11 +86,11 @@ public:
private:
using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
- bool _is_nhwc{ false };
- int32_t _src_height{ 0 };
- int32_t _dst_width{ 0 };
- int32_t _dst_height{ 0 };
- int32_t _num_tiles_x{ 0 };
+ bool _is_nhwc{false};
+ int32_t _src_height{0};
+ int32_t _dst_width{0};
+ int32_t _dst_height{0};
+ int32_t _num_tiles_x{0};
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
index 9350bf74bb..b5ebac3b49 100644
--- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
@@ -39,14 +39,24 @@ namespace kernels
{
namespace gemm
{
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
- bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+ unsigned int n,
+ unsigned int m0,
+ unsigned int n0,
+ unsigned int k0,
+ unsigned int v0,
+ unsigned int h0,
+ bool lhs_interleave,
+ bool rhs_interleave,
+ bool lhs_transpose,
+ bool rhs_transpose,
+ bool export_to_cl_image)
{
ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0);
ARM_COMPUTE_ERROR_ON(v0 == 0);
v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
- if(h0 == 0)
+ if (h0 == 0)
{
// When h0 is 0, we should take the maximum H0 possible
h0 = std::max(n / n0, 1U);
@@ -62,17 +72,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned
return std::make_pair(lhs_info, rhs_info);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
- unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type)
{
- ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true");
+ ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true,
+ "The fallback GeMM configuration cannot have export_to_cl_image = true");
const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
const TensorInfo tensor_reshaped_info(shape, 1, data_type);
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
+ if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
{
return info_img;
}
@@ -90,42 +105,56 @@ void update_padding_for_cl_image(ITensorInfo *tensor)
const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment");
- if(pixel_alignment == 0)
+ if (pixel_alignment == 0)
{
return;
}
const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
- const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
- const unsigned int padding = round_up_width - stride_y_in_elements;
+ const unsigned int round_up_width =
+ ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
+ const unsigned int padding = round_up_width - stride_y_in_elements;
tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
}
Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
{
- if(rhs_info.export_to_cl_image)
+ if (rhs_info.export_to_cl_image)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false,
+ "Export to cl_image only supported with n0 = 4, 8 or 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true,
+ "Export to cl_image only supported with k0 = 4, 8 or 16");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
+ "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
+ "Impossible to retrieve the cl_image pitch alignment");
// Check the width and height of the output tensor.
// Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4,
+ "Not supported width for cl_image");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h,
+ "Not supported height for cl_image");
}
return Status{};
}
-bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b,
- const DataType data_type, unsigned int &best_m0, unsigned int &best_n0)
+bool is_mmul_kernel_preferred(const unsigned int m,
+ const unsigned int n,
+ const unsigned int k,
+ const unsigned int b,
+ const DataType data_type,
+ unsigned int &best_m0,
+ unsigned int &best_n0)
{
ARM_COMPUTE_UNUSED(n, k, b, data_type);
@@ -141,7 +170,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const
return ((k % mmul_k0) == 0) && (gws_y > 4);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
size_t min_acc = std::numeric_limits<size_t>::max();
size_t min_idx = 0;
@@ -150,12 +180,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf
const size_t num_rows = configs.size();
const size_t num_cols = configs[0].size();
- ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
+ ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, "
+ "N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
ARM_COMPUTE_UNUSED(num_cols);
// Find nearest GeMM workload
// Note: the workload does not depend on the K dimension
- for(size_t y = 0; y < num_rows; ++y)
+ for (size_t y = 0; y < num_rows; ++y)
{
size_t mc0 = static_cast<size_t>(configs[y][0]);
size_t nc0 = static_cast<size_t>(configs[y][1]);
@@ -168,7 +199,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf
acc += (k - kc0) * (k - kc0);
acc += (b - bc0) * (b - bc0);
acc = std::sqrt(acc);
- if(acc < min_acc)
+ if (acc < min_acc)
{
min_acc = acc;
min_idx = y;
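
find_lhs_rhs_info() above picks a tuned GEMM configuration by treating the first four columns of each row (M, N, K, B) as a point and selecting the row nearest, in Euclidean distance, to the requested workload; the remaining columns of that row hold the M0/N0/K0/V0/H0/interleave/transpose/image parameters. A compact sketch of that lookup follows; the example rows and the use of double instead of the original integer accumulator are illustrative only:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

// Return the index of the row whose (M, N, K, B) prefix is closest to the query.
static std::size_t nearest_config(const std::vector<std::vector<int>> &configs,
                                  unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
    double      min_acc = std::numeric_limits<double>::max();
    std::size_t min_idx = 0;
    for (std::size_t y = 0; y < configs.size(); ++y)
    {
        const double dm  = static_cast<double>(m) - configs[y][0];
        const double dn  = static_cast<double>(n) - configs[y][1];
        const double dk  = static_cast<double>(k) - configs[y][2];
        const double db  = static_cast<double>(b) - configs[y][3];
        const double acc = std::sqrt(dm * dm + dn * dn + dk * dk + db * db);
        if (acc < min_acc)
        {
            min_acc = acc;
            min_idx = y;
        }
    }
    return min_idx;
}

int main()
{
    // Two hypothetical rows: M, N, K, B followed by the 10 tuning parameters.
    const std::vector<std::vector<int>> configs = {
        {64, 64, 64, 1, 4, 4, 4, 1, 1, 0, 0, 0, 0, 0},
        {1024, 1024, 1024, 1, 5, 4, 4, 1, 8, 0, 1, 0, 1, 1},
    };
    std::printf("closest row: %zu\n", nearest_config(configs, 900, 900, 256, 1)); // 1
}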
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
index 6689b10e69..84776fb207 100644
--- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
@@ -54,8 +54,18 @@ using GeMMConfigsMatrix = std::vector<std::vector<int32_t>>;
*
* @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
*/
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
- bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+ unsigned int n,
+ unsigned int m0,
+ unsigned int n0,
+ unsigned int k0,
+ unsigned int v0,
+ unsigned int h0,
+ bool lhs_interleave,
+ bool rhs_interleave,
+ bool lhs_transpose,
+ bool rhs_transpose,
+ bool export_to_cl_image = false);
/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
*
@@ -72,9 +82,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned
*
* @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
*/
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
- unsigned int n, unsigned int k, unsigned int b, DataType data_type);
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type);
/** Update padding required to export the OpenCL buffer to OpenCL image2d
*
@@ -103,8 +117,13 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info,
*
* @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise
*/
-bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b,
- const DataType data_type, unsigned int &best_m0, unsigned int &best_n0);
+bool is_mmul_kernel_preferred(const unsigned int m,
+ const unsigned int n,
+ const unsigned int k,
+ const unsigned int b,
+ const DataType data_type,
+ unsigned int &best_m0,
+ unsigned int &best_n0);
/** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user
*
@@ -116,7 +135,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const
*
* @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
*/
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
} // namespace gemm
} // namespace kernels
} // namespace opencl
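
update_padding_for_cl_image(), declared above and implemented in ClGemmHelpers.cpp, pads each tensor row so that its pitch is a multiple of the device's cl_image pitch alignment. A worked example of that round-up; the pixel alignment of 64 and the four floats per image pixel are assumed values, not taken from the patch:

#include <cstdio>

int main()
{
    const unsigned int num_floats_per_pixel = 4;                                       // assumed RGBA float pixels
    const unsigned int pixel_alignment      = 64;                                      // assumed device alignment
    const unsigned int row_pitch_alignment  = pixel_alignment * num_floats_per_pixel;  // 256 elements
    const unsigned int stride_y_in_elements = 1000;                                    // elements per tensor row

    const unsigned int round_up_width =
        ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
    const unsigned int padding = round_up_width - stride_y_in_elements;

    std::printf("row padded from %u to %u elements (+%u)\n", stride_y_in_elements, round_up_width, padding);
}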
diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
index a49836cfda..9d08633963 100644
--- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
#include <array>
@@ -56,8 +57,7 @@ public:
* @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- CLGEMMConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -69,7 +69,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -96,8 +96,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClGemmKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClGemmKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig);
@@ -111,7 +110,8 @@ public:
* @param[in] b Batch size
* @param[in] data_type Data type
*/
- virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+ virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
protected:
GPUTarget _target;
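
CLGEMMConfigArray above is a small table of three heuristics (F32, F16, quantized integer) selected by data type. Below is a self-contained stand-in that uses plain function pointers instead of the library's member-function pointers; the names are hypothetical and the quantized cases are collapsed into a single default branch:

#include <array>
#include <cstdio>

enum class DataType { F32, F16, QASYMM8 };

// Minimal stand-in for CLGEMMConfigArray<T>: fixed table of three entries.
template <typename T>
class ConfigArraySketch
{
public:
    ConfigArraySketch(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} {}

    T get_function(DataType data_type) const
    {
        switch (data_type)
        {
            case DataType::F32:
                return _configs[0];
            case DataType::F16:
                return _configs[1];
            default:
                return _configs[2]; // all quantized integer types share one entry in this sketch
        }
    }

private:
    std::array<T, 3> _configs;
};

static const char *heuristic_f32() { return "f32 heuristic"; }
static const char *heuristic_int8() { return "int8 heuristic"; }

int main()
{
    // As in the Bifrost configuration below, FP16 reuses the F32 heuristic.
    ConfigArraySketch<const char *(*)()> configs(&heuristic_f32, &heuristic_f32, &heuristic_int8);
    std::printf("%s\n", configs.get_function(DataType::F16)());     // f32 heuristic
    std::printf("%s\n", configs.get_function(DataType::QASYMM8)()); // int8 heuristic
}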
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
index d74c7fac9b..2f37eef31f 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <utility>
@@ -38,31 +39,34 @@ namespace kernels
{
namespace gemm
{
-ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
- &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
- &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
+ &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
+ &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
- &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
- &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+ &ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
+ &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
+ &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32,
- &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
- &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClGemmDefaultConfigNativeBifrost::configure_default_f32,
+ &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
+ &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G76:
func = configs_G76.get_function(data_type);
@@ -79,18 +83,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n < 2048)
+ if (n < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
}
- else if(n >= 2048 && n < 8192)
+ else if (n >= 2048 && n < 8192)
{
return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
}
@@ -105,20 +110,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(dot8_supported(CLKernelLibrary::get().get_device()))
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
{
- if(m == 1)
+ if (m == 1)
{
- if(n < 2048)
+ if (n < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
}
- else if(n >= 2048 && n < 16384)
+ else if (n >= 2048 && n < 16384)
{
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
}
@@ -129,7 +135,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
else
{
- if(m < 64)
+ if (m < 64)
{
return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
}
@@ -141,9 +147,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
else
{
- if(m == 1)
+ if (m == 1)
{
- if(n < 8192)
+ if (n < 8192)
{
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
}
@@ -159,24 +165,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n > 4196)
+ if (n > 4196)
{
return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false);
}
else
{
- if(k < 2048)
+ if (k < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false);
}
- else if(k >= 2048 && k < 16384)
+ else if (k >= 2048 && k < 16384)
{
return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
}
@@ -192,18 +199,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n < 2048)
+ if (n < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
}
- else if(n >= 2048 && n < 16384)
+ else if (n >= 2048 && n < 16384)
{
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
}
@@ -214,7 +222,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
else
{
- if(m < 64)
+ if (m < 64)
{
return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
}
@@ -225,7 +233,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
@@ -233,7 +242,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
@@ -243,4 +253,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost
} // namespace gemm
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
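[Editor's sketch] The configure() hunks in this file revolve around one pattern: a pointer-to-member-function alias (ConfigurationFunctionExecutorPtr), a CLGEMMConfigArray filled with one heuristic per data type and per GPU target, and a final dispatch through (this->*func)(m, n, k, b). The standalone C++ sketch below mirrors only that dispatch shape; every class, enum and function name in it is a hypothetical stand-in, not the Compute Library API.

#include <array>
#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <utility>

// Hypothetical stand-ins for GEMMLHSMatrixInfo / GEMMRHSMatrixInfo.
struct LhsInfo { unsigned int m0, k0; };
struct RhsInfo { unsigned int n0, k0; };

enum class DemoDataType { F32, F16, U8 };

class DemoGemmConfig
{
public:
    // Pointer-to-member-function type, analogous to ConfigurationFunctionExecutorPtr.
    using ConfigFn = std::pair<LhsInfo, RhsInfo> (DemoGemmConfig::*)(unsigned int m, unsigned int n);

    std::pair<LhsInfo, RhsInfo> configure(unsigned int m, unsigned int n, DemoDataType dt)
    {
        // One entry per data type, analogous to a CLGEMMConfigArray for one GPU target.
        const std::array<ConfigFn, 3> table = {
            &DemoGemmConfig::configure_f32, // F32
            &DemoGemmConfig::configure_f32, // F16 reuses the F32 heuristic in this sketch
            &DemoGemmConfig::configure_u8   // U8
        };

        ConfigFn func = table[static_cast<std::size_t>(dt)];
        if (func == nullptr) // mirrors the ARM_COMPUTE_ERROR_ON_MSG guard
        {
            throw std::runtime_error("Data type not supported");
        }
        // Invoke the selected heuristic through the member pointer.
        return (this->*func)(m, n);
    }

private:
    std::pair<LhsInfo, RhsInfo> configure_f32(unsigned int m, unsigned int n)
    {
        // Tiny decision tree in the spirit of the per-target heuristics.
        if (m == 1)
        {
            return {LhsInfo{1, 4}, RhsInfo{n < 2048 ? 2u : 4u, 4}};
        }
        return {LhsInfo{5, 4}, RhsInfo{4, 4}};
    }

    std::pair<LhsInfo, RhsInfo> configure_u8(unsigned int m, unsigned int n)
    {
        return {LhsInfo{m < 64 ? 2u : 4u, 16}, RhsInfo{n > 2048 ? 4u : 2u, 16}};
    }
};

int main()
{
    DemoGemmConfig cfg;
    auto [lhs, rhs] = cfg.configure(1, 1024, DemoDataType::F32);
    std::printf("m0=%u n0=%u\n", lhs.m0, rhs.n0);
    return 0;
}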
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
index 9af5dc4135..f822daae53 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
@@ -45,15 +45,22 @@ public:
ClGemmDefaultConfigNativeBifrost(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
index b9f36c7210..f87fb1b659 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <utility>
@@ -38,18 +39,17 @@ namespace kernels
{
namespace gemm
{
-ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu)
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr,
- nullptr,
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr,
&ClGemmDefaultConfigNativeMidgard::default_q8);
auto func = configs_default.get_function(data_type);
@@ -57,7 +57,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
@@ -70,4 +71,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard
} // namespace gemm
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
index c055753c48..fa76c5dba7 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
@@ -45,10 +45,12 @@ public:
ClGemmDefaultConfigNativeMidgard(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
index 95a4d2bd69..97a1298b0a 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <utility>
@@ -38,37 +39,38 @@ namespace kernels
{
namespace gemm
{
-ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32,
- &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
- &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+ &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
+ &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
auto func = configs_default.get_function(data_type);
ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n < 2048)
+ if (n < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
}
- else if(n >= 2048 && n < 8192)
+ else if (n >= 2048 && n < 8192)
{
return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
}
@@ -83,18 +85,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n < 2048)
+ if (n < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
}
- else if(n >= 2048 && n < 8192)
+ else if (n >= 2048 && n < 8192)
{
return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
}
@@ -109,20 +112,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(dot8_supported(CLKernelLibrary::get().get_device()))
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
{
- if(m == 1)
+ if (m == 1)
{
- if(n < 2048)
+ if (n < 2048)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
}
- else if(n >= 2048 && n < 16384)
+ else if (n >= 2048 && n < 16384)
{
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
}
@@ -133,7 +137,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall
}
else
{
- if(m < 64)
+ if (m < 64)
{
return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
}
@@ -145,9 +149,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall
}
else
{
- if(m == 1)
+ if (m == 1)
{
- if(n < 8192)
+ if (n < 8192)
{
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
}
@@ -165,4 +169,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall
} // namespace gemm
} // namespace kernels
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
index f0f812fd46..c91b095279 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
@@ -45,12 +45,16 @@ public:
ClGemmDefaultConfigNativeValhall(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
index cf8412830b..955bb3c01a 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
@@ -51,7 +51,7 @@ public:
*/
static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
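[Editor's sketch] The create(GPUTarget) hunk just above shows the factory side of the design: switch on get_arch_from_target(gpu) and return the architecture-specific heuristic behind a std::unique_ptr to the common interface. The sketch below reproduces only that factory shape; Arch, IKernelConfig and the concrete config structs are illustrative names, not the library's types.

#include <memory>
#include <stdexcept>

// Hypothetical architecture enum and interface standing in for GPUTarget / IClGemmKernelConfig.
enum class Arch { Midgard, Bifrost, Valhall };

struct IKernelConfig
{
    virtual ~IKernelConfig() = default;
    virtual const char *name() const = 0;
};

struct MidgardConfig : IKernelConfig { const char *name() const override { return "midgard"; } };
struct BifrostConfig : IKernelConfig { const char *name() const override { return "bifrost"; } };
struct ValhallConfig : IKernelConfig { const char *name() const override { return "valhall"; } };

// Factory analogous in shape to ClGemmNativeKernelConfig::create(): switch on the
// architecture and hand back the matching heuristic object behind the common interface.
std::unique_ptr<IKernelConfig> create_config(Arch arch)
{
    switch (arch)
    {
        case Arch::Midgard:
            return std::make_unique<MidgardConfig>();
        case Arch::Bifrost:
            return std::make_unique<BifrostConfig>();
        case Arch::Valhall:
            return std::make_unique<ValhallConfig>();
        default:
            throw std::invalid_argument("Unsupported architecture");
    }
}

int main()
{
    auto cfg = create_config(Arch::Bifrost);
    return cfg->name()[0] == 'b' ? 0 : 1;
}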
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
index 657018eb53..c956c347ef 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <utility>
@@ -43,30 +44,31 @@ namespace gemm
{
using namespace arm_compute::misc::shape_calculator;
-ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32,
- &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
- &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
+ &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32,
- &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
- &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+ &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
+ &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32,
- &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
- &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+ &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
+ &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G76:
func = configs_G76.get_function(data_type);
@@ -83,12 +85,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
}
@@ -98,12 +101,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false);
}
@@ -113,14 +117,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(dot8_supported(CLKernelLibrary::get().get_device()))
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
{
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
}
@@ -131,7 +136,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
else
{
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
}
@@ -142,7 +147,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
@@ -154,100 +160,108 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
- if(workload <= 274.4000f)
+ if (workload <= 274.4000f)
{
- if(r_nk <= 0.7461f)
+ if (r_nk <= 0.7461f)
{
- if(r_mn <= 21.1667f)
+ if (r_mn <= 21.1667f)
{
return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
{
- if(r_mk <= 17.3926f)
+ if (r_mk <= 17.3926f)
{
- if(workload <= 542.4000f)
+ if (workload <= 542.4000f)
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
{
- if(r_nk <= 0.5463f)
+ if (r_nk <= 0.5463f)
{
- if(workload <= 11767.6001f)
+ if (workload <= 11767.6001f)
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- if(workload <= 323.4000f)
+ if (workload <= 323.4000f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false);
}
@@ -257,7 +271,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
@@ -268,7 +283,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
GEMMRHSMatrixInfo rhs_info_img;
// Get lhs_info/rhs_info in case of OpenCL buffer
- if(n <= 4)
+ if (n <= 4)
{
std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
}
@@ -279,15 +294,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
// Get lhs_info/rhs_info in case of OpenCL image
// Condition on the GPU workload
- if((m / 4) * (n / 4) >= 2560)
+ if ((m / 4) * (n / 4) >= 2560)
{
// Big workload
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
}
else
{
// Small workload
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
}
const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
@@ -297,7 +314,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
// In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
const bool use_cl_image2d = (n <= 4) ? false : true;
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+ if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
{
return std::make_pair(lhs_info_img, rhs_info_img);
}
@@ -307,16 +324,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- if(workload <= 1595.2000f)
+ if (workload <= 1595.2000f)
{
- if(r_mk <= 2.1044f)
+ if (r_mk <= 2.1044f)
{
- if(workload <= 870.4000f)
+ if (workload <= 870.4000f)
{
return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false);
}
@@ -336,12 +354,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
}
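[Editor's sketch] The reshaped Bifrost heuristics reformatted above build two candidate (lhs, rhs) configurations, one targeting an OpenCL buffer and one targeting an OpenCL image2d, then choose between them (select_lhs_rhs_info, validate_image2d_support_on_rhs), falling back to the buffer variant for vector-by-matrix cases or when image2d support cannot be validated. The sketch below captures only that "compute both, validate, pick" shape; the capability check and tiling fields are hypothetical, not the library's.

#include <cstdio>

// Hypothetical stand-in: a candidate tiling plus a flag for RHS export to image2d.
struct Tiling { unsigned int n0, h0; bool export_to_image; };

// Placeholder for validate_image2d_support_on_rhs(); a real check would look at row
// pitch alignment and the device's maximum image dimensions.
static bool image2d_supported(unsigned int n, unsigned int k, const Tiling &t)
{
    return t.export_to_image && (n % 4 == 0) && (k % 4 == 0);
}

// "Compute both candidates, validate the image2d one, otherwise fall back to buffer."
static Tiling pick_tiling(unsigned int m, unsigned int n, unsigned int k)
{
    const bool big_workload = (m / 4) * (n / 4) >= 2560; // same workload guard as the diff

    const Tiling buffer_candidate{big_workload ? 8u : 4u, 16u, /*export_to_image=*/false};
    const Tiling image_candidate{big_workload ? 8u : 4u, 2u, /*export_to_image=*/true};

    // Vector-by-matrix cases with few work-items stay on the buffer path.
    const bool use_image2d = (n > 4) && image2d_supported(n, k, image_candidate);
    return use_image2d ? image_candidate : buffer_candidate;
}

int main()
{
    const Tiling t = pick_tiling(256, 512, 128);
    std::printf("n0=%u h0=%u image=%d\n", t.n0, t.h0, t.export_to_image ? 1 : 0);
    return 0;
}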
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
index d86d1ba0a7..9227ec2551 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
@@ -45,17 +45,26 @@ public:
ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
index 58d0873b86..70b324eb5a 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <utility>
@@ -38,26 +39,27 @@ namespace kernels
{
namespace gemm
{
-ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu)
- : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32,
- &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
- &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
+ &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32,
- &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
- &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
+ &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G78:
func = configs_G78.get_function(data_type);
@@ -72,12 +74,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1);
}
@@ -87,7 +90,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
@@ -104,17 +108,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
- if(r_mk <= 0.11824845522642136)
+ if (r_mk <= 0.11824845522642136)
{
- if(workload <= 880.0)
+ if (workload <= 880.0)
{
return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
}
else
{
- if(r_nk <= 0.42521367967128754)
+ if (r_nk <= 0.42521367967128754)
{
- if(workload <= 1726.4000244140625)
+ if (workload <= 1726.4000244140625)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0);
}
@@ -123,13 +127,12 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
else
{
- if(workload <= 1241.6000366210938)
+ if (workload <= 1241.6000366210938)
{
return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
}
@@ -142,17 +145,16 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 11404.7998046875)
+ if (workload <= 11404.7998046875)
{
- if(r_mk <= 1.0126488208770752)
+ if (r_mk <= 1.0126488208770752)
{
- if(r_mn <= 2.545312523841858)
+ if (r_mn <= 2.545312523841858)
{
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
else
{
@@ -161,43 +163,39 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 2881.199951171875)
+ if (workload <= 2881.199951171875)
{
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
else
{
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
}
else
{
- if(r_nk <= 0.5765306055545807)
+ if (r_nk <= 0.5765306055545807)
{
- if(r_mn <= 6.010416746139526)
+ if (r_mn <= 6.010416746139526)
{
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
else
{
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
else
@@ -205,27 +203,27 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_mk = static_cast<float>(m) / static_cast<float>(k);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- if(workload <= 1288.0000f)
+ if (workload <= 1288.0000f)
{
- if(workload <= 505.6000f)
+ if (workload <= 505.6000f)
{
- if(r_mn <= 0.4466f)
+ if (r_mn <= 0.4466f)
{
- if(r_nk <= 0.2384f)
+ if (r_nk <= 0.2384f)
{
return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
}
@@ -241,9 +239,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(r_mn <= 0.2250f)
+ if (r_mn <= 0.2250f)
{
- if(r_mn <= 0.1599f)
+ if (r_mn <= 0.1599f)
{
return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
}
@@ -254,11 +252,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(r_mk <= 0.7609f)
+ if (r_mk <= 0.7609f)
{
- if(r_mn <= 2.5453f)
+ if (r_mn <= 2.5453f)
{
- if(workload <= 1089.6000f)
+ if (workload <= 1089.6000f)
{
return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
}
@@ -281,29 +279,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 5434.4001f)
+ if (workload <= 5434.4001f)
{
- if(workload <= 1603.2000f)
+ if (workload <= 1603.2000f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
else
{
- if(r_nk <= 0.6192f)
+ if (r_nk <= 0.6192f)
{
- if(r_mn <= 16.1016f)
+ if (r_mn <= 16.1016f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
else
{
- if(workload <= 2750.0000f)
+ if (workload <= 2750.0000f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
else
{
- if(r_mk <= 6.3151f)
+ if (r_mk <= 6.3151f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
}
@@ -316,15 +314,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(r_mk <= 0.0387f)
+ if (r_mk <= 0.0387f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
}
else
{
- if(r_mk <= 2.5859f)
+ if (r_mk <= 2.5859f)
{
- if(r_mk <= 0.2734f)
+ if (r_mk <= 0.2734f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
}
@@ -343,13 +341,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(r_mk <= 25.7500f)
+ if (r_mk <= 25.7500f)
{
- if(r_mk <= 0.3615f)
+ if (r_mk <= 0.3615f)
{
- if(r_mn <= 0.0913f)
+ if (r_mn <= 0.0913f)
{
- if(r_mk <= 0.0683f)
+ if (r_mk <= 0.0683f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
}
@@ -365,15 +363,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 11174.3999f)
+ if (workload <= 11174.3999f)
{
- if(r_mk <= 0.8047f)
+ if (r_mk <= 0.8047f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
else
{
- if(workload <= 7185.5999f)
+ if (workload <= 7185.5999f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
}
@@ -385,9 +383,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 17917.5000f)
+ if (workload <= 17917.5000f)
{
- if(r_mk <= 1.5078f)
+ if (r_mk <= 1.5078f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
@@ -398,7 +396,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 34449.6016f)
+ if (workload <= 34449.6016f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
@@ -412,11 +410,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(r_mk <= 331.1111f)
+ if (r_mk <= 331.1111f)
{
- if(workload <= 53397.5996f)
+ if (workload <= 53397.5996f)
{
- if(r_mn <= 57.8063f)
+ if (r_mn <= 57.8063f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
@@ -427,7 +425,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(r_nk <= 0.9211f)
+ if (r_nk <= 0.9211f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
}
@@ -439,7 +437,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 38070.4004f)
+ if (workload <= 38070.4004f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
}
@@ -453,27 +451,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- if(workload <= 801.6000f)
+ if (workload <= 801.6000f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
}
else
{
- if(r_mn <= 0.1211f)
+ if (r_mn <= 0.1211f)
{
- if(workload <= 3296.0000f)
+ if (workload <= 3296.0000f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
else
{
- if(r_nk <= 1.0625f)
+ if (r_nk <= 1.0625f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
@@ -485,15 +484,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 5068.8000f)
+ if (workload <= 5068.8000f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
}
else
{
- if(r_nk <= 0.2361f)
+ if (r_nk <= 0.2361f)
{
- if(workload <= 12630.0000f)
+ if (workload <= 12630.0000f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
}
@@ -504,7 +503,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
else
{
- if(workload <= 178790.3984f)
+ if (workload <= 178790.3984f)
{
return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
}
@@ -518,12 +517,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(n <= 4)
+ if (n <= 4)
{
return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1);
}
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
index 466eda00a6..5f62efb59e 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
@@ -45,14 +45,20 @@ public:
ClGemmDefaultConfigReshapedValhall(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
index 1c32f1358b..83928b3f4f 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
@@ -50,7 +50,7 @@ public:
*/
static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
index 9c23d9c998..c4825bfbeb 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
@@ -29,7 +29,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
#include <utility>
namespace arm_compute
@@ -47,33 +49,39 @@ ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBif
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
-
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
+
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G76:
func = configs_G76.get_function(data_type);
@@ -96,14 +104,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n <= 2548)
+ if (n <= 2548)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
}
@@ -118,12 +127,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
const unsigned int h0 = std::max(n / 2, 1U);
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
@@ -131,7 +141,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
else
{
const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(m >= 28)
+ if (m >= 28)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1);
}
@@ -142,7 +152,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
@@ -154,9 +165,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
const bool is_workload_big = ((m * n * b) / 16) >= 2048;
- if(m == 1)
+ if (m == 1)
{
- if(n >= 8192)
+ if (n >= 8192)
{
const unsigned int h0 = std::max(n / 4, 1U);
return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false);
@@ -164,7 +175,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
else
{
const unsigned int h0 = std::max(n / 2, 1U);
- if(n <= 204)
+ if (n <= 204)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false);
}
@@ -177,25 +188,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
else
{
const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- if(is_workload_big)
+ if (is_workload_big)
{
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
}
else
{
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
}
}
// Get lhs_info/rhs_info in case of OpenCL image
const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- if(is_workload_big)
+ if (is_workload_big)
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
}
const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
@@ -205,7 +220,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
// In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
- if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+ if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
{
return std::make_pair(lhs_info_img, rhs_info_img);
}
@@ -215,7 +230,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
@@ -225,46 +241,49 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
- if(m == 1)
+ if (m == 1)
{
- if(r_nk <= 0.4664f)
+ if (r_nk <= 0.4664f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
{
- if(workload <= 274.4000f)
+ if (workload <= 274.4000f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
const unsigned int n0 = n < 1280 ? 2 : 4;
const unsigned int h0 = std::max(n / n0, 1U);
@@ -276,14 +295,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
- if(n > 2048)
+ if (n > 2048)
{
const unsigned int h0 = std::max(n / 4, 1U);
return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
@@ -300,7 +320,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
@@ -312,57 +333,59 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
- if(m == 1)
+ if (m == 1)
{
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
- if(r_mk <= 0.0026f)
+ if (r_mk <= 0.0026f)
{
- if(r_nk <= 0.4664f)
+ if (r_nk <= 0.4664f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
else
{
- if(r_mk <= 0.0148f)
+ if (r_mk <= 0.0148f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
}
else
{
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
- if(workload <= 362.6000f)
+ if (workload <= 362.6000f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
}
else
{
- if(r_mn <= 22.6067f)
+ if (r_mn <= 22.6067f)
{
- if(workload <= 708.8000f)
+ if (workload <= 708.8000f)
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
else
{
@@ -371,27 +394,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(r_nk <= 0.0917f)
+ if (r_nk <= 0.0917f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
}
else
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
}
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
- if(m == 1)
+ if (m == 1)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
}
@@ -400,15 +424,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- if(workload <= 7449.60f)
+ if (workload <= 7449.60f)
{
- if(workload <= 691.60f)
+ if (workload <= 691.60f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false);
}
else
{
- if(workload <= 4155.20f)
+ if (workload <= 4155.20f)
{
return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
}
@@ -420,21 +444,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(workload <= 16300.80f)
+ if (workload <= 16300.80f)
{
- if(r_mn <= 44.56f)
+ if (r_mn <= 44.56f)
{
GEMMLHSMatrixInfo lhs_info_buf;
GEMMRHSMatrixInfo rhs_info_buf;
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
else
{
@@ -448,23 +473,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+ std::tie(lhs_info_img, rhs_info_img) =
+ configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
+ std::tie(lhs_info_buf, rhs_info_buf) =
+ configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F16);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
}
}
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
const unsigned int n0 = n < 1280 ? 2 : 4;
const unsigned int h0 = std::max(n / n0, 1U);
@@ -476,14 +503,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(dot8_supported(CLKernelLibrary::get().get_device()))
+ if (dot8_supported(CLKernelLibrary::get().get_device()))
{
- if(m == 1)
+ if (m == 1)
{
const unsigned int h0 = std::max(n / 2, 1U);
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
@@ -497,7 +525,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
else
{
const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
- if(m == 1)
+ if (m == 1)
{
return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
}
@@ -508,12 +536,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
const unsigned int h0 = std::max(n / 2, 1U);
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
@@ -524,12 +553,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
const unsigned int h0 = std::max(n / 2, 1U);
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true);
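
Note on the pattern reformatted in configure() above: the selector keeps one table of per-data-type member-function pointers per GPU target, picks a table with a switch on _target, and invokes the chosen entry through (this->*func)(m, n, k, b). A minimal standalone sketch of that dispatch follows; DemoSelector, ConfigTable and GpuTarget are simplified stand-in names for illustration, not the library's CLGEMMConfigArray/GPUTarget types.

    #include <iostream>
    #include <utility>

    enum class DataType { F32, F16, U8 };
    enum class GpuTarget { G76, G7x };

    struct DemoSelector
    {
        using Config  = std::pair<int, int>; // stand-in for {GEMMLHSMatrixInfo, GEMMRHSMatrixInfo}
        using FuncPtr = Config (DemoSelector::*)(unsigned int m, unsigned int n);

        // Minimal stand-in for a per-data-type config array: one entry per data type.
        struct ConfigTable
        {
            FuncPtr f32;
            FuncPtr f16;
            FuncPtr u8;
            FuncPtr get_function(DataType dt) const
            {
                switch (dt)
                {
                    case DataType::F32: return f32;
                    case DataType::F16: return f16;
                    default:            return u8;
                }
            }
        };

        Config configure(unsigned int m, unsigned int n, GpuTarget target, DataType dt)
        {
            const ConfigTable configs_g76{&DemoSelector::configure_g76_f32,
                                          &DemoSelector::configure_generic,
                                          &DemoSelector::configure_generic};
            const ConfigTable configs_g7x{&DemoSelector::configure_generic,
                                          &DemoSelector::configure_generic,
                                          &DemoSelector::configure_generic};

            FuncPtr func = nullptr;
            switch (target)
            {
                case GpuTarget::G76:
                    func = configs_g76.get_function(dt);
                    break;
                default:
                    func = configs_g7x.get_function(dt);
                    break;
            }
            // Dispatch through the selected member-function pointer.
            return (this->*func)(m, n);
        }

        Config configure_g76_f32(unsigned int m, unsigned int n)
        {
            return {m == 1 ? 1 : 4, n <= 204 ? 2 : 4}; // placeholder block sizes
        }
        Config configure_generic(unsigned int m, unsigned int /*n*/)
        {
            return {m == 1 ? 1 : 2, 4}; // placeholder block sizes
        }
    };

    int main()
    {
        DemoSelector               selector;
        const DemoSelector::Config cfg = selector.configure(64U, 512U, GpuTarget::G76, DataType::F32);
        std::cout << "m0=" << cfg.first << " n0=" << cfg.second << '\n';
        return 0;
    }
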
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
index 321cbb5250..77c0c8d500 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
@@ -45,21 +45,34 @@ public:
ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
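
Note on configure_G76_f32 above: it builds both a buffer-backed and an image2d-backed candidate and keeps the image2d variant only when the workload is large enough and the RHS layout passes the image2d validation. A minimal standalone sketch of that choice is below; image2d_supported() is a hypothetical stand-in for validate_image2d_support_on_rhs(), and the block sizes are placeholders.

    #include <cstdio>

    struct GemmConfig
    {
        unsigned int m0, n0, k0;
        bool         export_rhs_to_cl_image;
    };

    // Hypothetical stand-in for the real validation: the library checks things
    // such as pitch alignment and maximum image dimensions of the OpenCL device.
    static bool image2d_supported(unsigned int n, unsigned int k, unsigned int n0)
    {
        return (n % n0 == 0) && (k > 0) && (n / n0 <= 16384);
    }

    static GemmConfig choose_config(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        const GemmConfig cfg_buf{m == 1 ? 1u : 4u, 4u, 4u, false}; // OpenCL buffer variant
        const GemmConfig cfg_img{m == 1 ? 1u : 4u, 4u, 4u, true};  // OpenCL image2d variant

        // Vector-by-matrix products and small workloads stay on the buffer path
        // (same condition as use_cl_image2d in the diff above).
        const bool use_cl_image2d = !((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128));

        if (use_cl_image2d && image2d_supported(n, k, cfg_img.n0))
        {
            return cfg_img;
        }
        return cfg_buf;
    }

    int main()
    {
        const GemmConfig cfg = choose_config(256, 512, 512, 1);
        std::printf("m0=%u n0=%u k0=%u rhs_as_image=%d\n", cfg.m0, cfg.n0, cfg.k0, cfg.export_rhs_to_cl_image);
        return 0;
    }
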
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
index d08bf84c72..da3e2ec912 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
@@ -50,30 +50,35 @@ ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyVal
{
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k,
- unsigned int b);
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+ ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
- CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16,
- &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+ CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16,
+ &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G78:
func = configs_G78.get_function(data_type);
@@ -96,29 +101,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
return (this->*func)(m, n, k, b);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
- if(m == 1)
+ if (m == 1)
{
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- if(r_mk <= 0.0064484127797186375)
+ if (r_mk <= 0.0064484127797186375)
{
- if(r_mn <= 0.0028273810748942196)
+ if (r_mn <= 0.0028273810748942196)
{
GEMMLHSMatrixInfo lhs_info_buf;
GEMMRHSMatrixInfo rhs_info_buf;
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
- const unsigned int h0 = std::max(n / 4, 1U);
+ const unsigned int h0 = std::max(n / 4, 1U);
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
else
{
@@ -127,7 +132,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(r_mk <= 0.020312500186264515)
+ if (r_mk <= 0.020312500186264515)
{
return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
}
@@ -143,9 +148,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
const float r_mk = static_cast<float>(m) / static_cast<float>(k);
- if(workload <= 1999.2000122070312)
+ if (workload <= 1999.2000122070312)
{
- if(workload <= 747.1999816894531)
+ if (workload <= 747.1999816894531)
{
return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
}
@@ -159,15 +164,14 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
{
- if(r_mn <= 0.03348214365541935)
+ if (r_mn <= 0.03348214365541935)
{
- if(r_mk <= 0.028125000186264515)
+ if (r_mk <= 0.028125000186264515)
{
return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
}
@@ -181,8 +185,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
else
@@ -195,168 +198,112 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
- std::make_pair(lhs_info_buf, rhs_info_buf),
- n, k, b, DataType::F32);
+ std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
}
}
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
- const GeMMConfigsMatrix configs_1nkb_best =
- {
- { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 },
- { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 },
- { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 },
- { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_n_small_best =
- {
- { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 },
- { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 },
- { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 },
- { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_n_small_fallback =
- {
- { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 },
- { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 },
- { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 },
- { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_m_gt_n_best =
- {
- { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
- { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 },
- { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 },
- { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback =
- {
- { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
- { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 },
- { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 },
- { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 },
- { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 },
- { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 },
- { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_n_gt_m_best =
- {
- { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 },
- { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 },
- { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- };
-
- const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback =
- {
- { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 },
- { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 },
+ const GeMMConfigsMatrix configs_1nkb_best = {
+ {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+ {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0},
+ {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0},
+ {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+ {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+ {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+ {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+ {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0},
+ {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0},
+ {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1},
+ {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+ {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+ {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+ {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+ {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
+ {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+ {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {
+ {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+ {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1},
+ {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
};
- const GeMMConfigsMatrix configs_mnkb_squared_best =
- {
- { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 },
- { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 },
- { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 },
- { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 },
- { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {
+ {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+ {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
};
- const GeMMConfigsMatrix configs_mnkb_squared_fallback =
- {
- { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 },
- { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 },
- { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 },
- { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 },
- { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_best_batched =
- {
- { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
- { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_fallback_batched =
- {
- { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
- { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
- { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }
- };
+ const GeMMConfigsMatrix configs_mnkb_squared_best = {
+ {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+ {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1},
+ {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+ {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+ {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0},
+ {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_best_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+ {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+ {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
const GeMMConfigsMatrix *configs_best_to_use = nullptr;
const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
- if(b == 1)
+ if (b == 1)
{
constexpr float ratio_m_gt_n = 10.f;
constexpr float ratio_n_gt_m = 0.1f;
constexpr unsigned int n_small_thr = 4;
const float ratio = static_cast<float>(m) / static_cast<float>(n);
- if(m == 1)
+ if (m == 1)
{
// We do not need fallback in this case, as we never use cl_image for the rhs tensor
configs_best_to_use = &configs_1nkb_best;
configs_fallback_to_use = &configs_1nkb_best;
}
- else if(n <= n_small_thr && ratio > ratio_m_gt_n)
+ else if (n <= n_small_thr && ratio > ratio_m_gt_n)
{
configs_best_to_use = &configs_mnkb_n_small_best;
configs_fallback_to_use = &configs_mnkb_n_small_fallback;
}
- else if(ratio > ratio_m_gt_n)
+ else if (ratio > ratio_m_gt_n)
{
configs_best_to_use = &configs_mnkb_m_gt_n_best;
configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
}
- else if(ratio < ratio_n_gt_m)
+ else if (ratio < ratio_n_gt_m)
{
configs_best_to_use = &configs_mnkb_n_gt_m_best;
configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
@@ -381,17 +328,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
- return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0),
- std::make_pair(lhs_info1, rhs_info1),
- n, k, b, DataType::F16);
+ return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+ DataType::F16);
}
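
Note on the table-driven branch above (configure_G77_f16): the m/n ratio routes the query to a best/fallback GeMMConfigsMatrix, and the entry whose reference (m, n, k, b) is closest to the query supplies the candidate configuration. A minimal standalone sketch follows; the row layout, the distance metric and the find_closest() helper are hypothetical simplifications, not the library's find_lhs_rhs_info()/select_lhs_rhs_info().

    #include <cmath>
    #include <cstdio>
    #include <vector>

    struct ConfigRow
    {
        unsigned int m, n, k, b; // reference GEMM shape
        unsigned int m0, n0, k0; // block sizes attached to that shape
    };

    static const ConfigRow &find_closest(const std::vector<ConfigRow> &table,
                                         unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        const ConfigRow *best  = &table.front();
        double           score = 1e300;
        for (const ConfigRow &row : table)
        {
            // Hypothetical metric: sum of relative (log-scale) distances per dimension.
            const double d = std::fabs(std::log(double(row.m) / m)) + std::fabs(std::log(double(row.n) / n)) +
                             std::fabs(std::log(double(row.k) / k)) + std::fabs(std::log(double(row.b) / b));
            if (d < score)
            {
                score = d;
                best  = &row;
            }
        }
        return *best;
    }

    int main()
    {
        const std::vector<ConfigRow> configs_m_gt_n = {{25584, 88, 16, 1, 4, 8, 4}, {3728, 96, 196, 1, 4, 8, 4}};
        const std::vector<ConfigRow> configs_n_gt_m = {{24, 488, 88, 1, 2, 4, 16}, {49, 1024, 512, 1, 4, 4, 8}};

        const unsigned int m = 50, n = 1000, k = 500, b = 1;
        const float        ratio = float(m) / float(n);

        // Shape-ratio routing mirrors the thresholds used in the diff (ratio_m_gt_n = 10.f).
        const std::vector<ConfigRow> &table = (ratio > 10.f) ? configs_m_gt_n : configs_n_gt_m;
        const ConfigRow              &row   = find_closest(table, m, n, k, b);
        std::printf("chosen block sizes: m0=%u n0=%u k0=%u\n", row.m0, row.n0, row.k0);
        return 0;
    }
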
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
ARM_COMPUTE_UNUSED(k);
ARM_COMPUTE_UNUSED(b);
- if(m == 1)
+ if (m == 1)
{
const unsigned int h0 = std::max(n / 2, 1U);
return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
@@ -399,7 +346,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
else
{
const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
- if(m >= 28)
+ if (m >= 28)
{
return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
}
@@ -410,30 +357,31 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_mk = static_cast<float>(m) / static_cast<float>(k);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
- if(m == 1)
+ if (m == 1)
{
- if(workload <= 278.7000f)
+ if (workload <= 278.7000f)
{
- if(workload <= 7.5000f)
+ if (workload <= 7.5000f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
}
else
{
- if(r_mn <= 0.0031f)
+ if (r_mn <= 0.0031f)
{
- if(workload <= 256.6000f)
+ if (workload <= 256.6000f)
{
- if(workload <= 16.7500f)
+ if (workload <= 16.7500f)
{
- if(r_nk <= 1.6671f)
+ if (r_nk <= 1.6671f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
}
@@ -454,15 +402,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(r_mk <= 0.0027f)
+ if (r_mk <= 0.0027f)
{
- if(r_mk <= 0.0014f)
+ if (r_mk <= 0.0014f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
}
else
{
- if(workload <= 8.9500f)
+ if (workload <= 8.9500f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
}
@@ -474,13 +422,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(workload <= 14.1500f)
+ if (workload <= 14.1500f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
}
else
{
- if(r_mk <= 0.0041f)
+ if (r_mk <= 0.0041f)
{
return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
}
@@ -495,9 +443,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(workload <= 363.7000f)
+ if (workload <= 363.7000f)
{
- if(r_mk <= 0.0031f)
+ if (r_mk <= 0.0031f)
{
return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
}
@@ -514,9 +462,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(workload <= 1384.8000f)
+ if (workload <= 1384.8000f)
{
- if(workload <= 704.0000f)
+ if (workload <= 704.0000f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
}
@@ -527,9 +475,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(workload <= 16761.6006f)
+ if (workload <= 16761.6006f)
{
- if(r_mn <= 187.1250f)
+ if (r_mn <= 187.1250f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
}
@@ -540,7 +488,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(r_mk <= 432.4630f)
+ if (r_mk <= 432.4630f)
{
return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
}
@@ -553,42 +501,37 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_mk = static_cast<float>(m) / static_cast<float>(k);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- if(m == 1)
+ if (m == 1)
{
- const GeMMConfigsMatrix configs_mnkb_best =
- {
- { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 },
- { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 },
- { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }
- };
+ const GeMMConfigsMatrix configs_mnkb_best = {
+ {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0},
+ {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}};
return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b);
}
else
{
- if(workload <= 1384.8000f)
+ if (workload <= 1384.8000f)
{
- if(r_nk <= 0.8333f)
+ if (r_nk <= 0.8333f)
{
- if(r_mk <= 0.9119f)
+ if (r_mk <= 0.9119f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1);
}
else
{
- if(r_nk <= 0.1181f)
+ if (r_nk <= 0.1181f)
{
return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
}
@@ -600,7 +543,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(r_mk <= 1.0013f)
+ if (r_mk <= 1.0013f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1);
}
@@ -612,11 +555,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(workload <= 11404.7998f)
+ if (workload <= 11404.7998f)
{
- if(r_mk <= 2.2884f)
+ if (r_mk <= 2.2884f)
{
- if(r_nk <= 0.9286f)
+ if (r_nk <= 0.9286f)
{
return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1);
}
@@ -632,9 +575,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
else
{
- if(r_nk <= 1.1926f)
+ if (r_nk <= 1.1926f)
{
- if(r_mn <= 1385.7917f)
+ if (r_mn <= 1385.7917f)
{
return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1);
}
@@ -652,12 +595,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
unsigned int best_m0;
unsigned int best_n0;
- if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+ if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
{
return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
}
@@ -667,153 +611,101 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
}
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
- const GeMMConfigsMatrix configs_1nkb_best =
- {
- { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
- { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0 },
- { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }
+ const GeMMConfigsMatrix configs_1nkb_best = {
+ {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+ {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+ {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+ {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+ {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+ {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}};
+
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1},
+ {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
};
- const GeMMConfigsMatrix configs_mnkb_n_small_best =
- {
- { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
- { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
- { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
- { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }
+ const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+ {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0},
+ {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+ {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+ {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0},
+ {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0},
+ {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0},
+ {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
};
- const GeMMConfigsMatrix configs_mnkb_m_gt_n_best =
- {
- { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 },
- { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 },
- { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
- { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 },
- { 16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
- { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 },
- };
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
- const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback =
- {
- { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 },
- { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 },
- { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
- { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
- { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 },
- { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 },
- { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 },
- { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
- { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
- { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 },
- { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 },
- { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
- { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 },
- };
+ const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+ {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
- const GeMMConfigsMatrix configs_mnkb_n_gt_m_best =
- {
- { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
- { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 },
- { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }
+ const GeMMConfigsMatrix configs_mnkb_squared_best = {
+ {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+ {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1},
};
- const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback =
- {
- { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
- { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 },
- { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_squared_best =
- {
- { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
- { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
- { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 },
- { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 },
- { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
- { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 },
+ const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+ {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+ {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+ {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+ {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0},
};
- const GeMMConfigsMatrix configs_mnkb_squared_fallback =
- {
- { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
- { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
- { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 },
- { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
- { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
- { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
- { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 },
- { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
- };
+ const GeMMConfigsMatrix configs_mnkb_best_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}};
- const GeMMConfigsMatrix configs_mnkb_best_batched =
- {
- { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 },
- { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 },
- { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
- { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 },
- { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }
- };
-
- const GeMMConfigsMatrix configs_mnkb_fallback_batched =
- {
- { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 },
- { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
- { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
- { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 },
- { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 },
- { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
- { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }
- };
+ const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+ {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},
+ {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+ {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0},
+ {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}};
const GeMMConfigsMatrix *configs_best_to_use = nullptr;
const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
- if(b == 1)
+ if (b == 1)
{
constexpr float ratio_m_gt_n = 10.f;
constexpr float ratio_n_gt_m = 0.1f;
constexpr unsigned int n_small_thr = 4;
const float ratio = static_cast<float>(m) / static_cast<float>(n);
- if(m == 1)
+ if (m == 1)
{
// We do not need fallback in this case, as we never use cl_image for the rhs tensor
configs_best_to_use = &configs_1nkb_best;
configs_fallback_to_use = &configs_1nkb_best;
}
- else if(n <= n_small_thr && ratio > ratio_m_gt_n)
+ else if (n <= n_small_thr && ratio > ratio_m_gt_n)
{
configs_best_to_use = &configs_mnkb_n_small_best;
configs_fallback_to_use = &configs_mnkb_n_small_best;
}
- else if(ratio > ratio_m_gt_n)
+ else if (ratio > ratio_m_gt_n)
{
configs_best_to_use = &configs_mnkb_m_gt_n_best;
configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
}
- else if(ratio < ratio_n_gt_m)
+ else if (ratio < ratio_n_gt_m)
{
configs_best_to_use = &configs_mnkb_n_gt_m_best;
configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
@@ -838,17 +730,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn
std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
- return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0),
- std::make_pair(lhs_info1, rhs_info1),
- n, k, b, DataType::F16);
+ return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+ DataType::F16);
}
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b)
{
unsigned int best_m0;
unsigned int best_n0;
- if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+ if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
{
return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
}
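
For reference, the table selection above reduces to a small shape-ratio heuristic: batched workloads (b != 1) get their own tables, m == 1 is treated as a GEMV-like case that needs no fallback, and otherwise the m/n ratio picks between the m-dominant, n-dominant and small-n tables, with the remaining (roughly square) shapes presumably falling through to the squared tables. Each "best" table is paired with a fallback used when the cl_image export of the RHS tensor cannot be used (see the comment in the m == 1 branch). A minimal standalone sketch of that bucketing; the bucket() helper is hypothetical and only the thresholds are taken from the code above:

#include <cstdio>
#include <string>

std::string bucket(unsigned int m, unsigned int n, unsigned int b)
{
    if (b != 1)
        return "batched"; // batched GEMM has its own best/fallback tables
    constexpr float        ratio_m_gt_n = 10.f;
    constexpr float        ratio_n_gt_m = 0.1f;
    constexpr unsigned int n_small_thr  = 4;
    const float ratio = static_cast<float>(m) / static_cast<float>(n);
    if (m == 1)
        return "1nkb"; // GEMV-like case, no cl_image fallback needed
    if (n <= n_small_thr && ratio > ratio_m_gt_n)
        return "n_small";
    if (ratio > ratio_m_gt_n)
        return "m_gt_n";
    if (ratio < ratio_n_gt_m)
        return "n_gt_m";
    return "squared"; // M and N of comparable size
}

int main()
{
    std::printf("%s\n", bucket(25584, 16, 1).c_str()); // m_gt_n
    std::printf("%s\n", bucket(49, 1024, 1).c_str());  // n_gt_m
    std::printf("%s\n", bucket(3136, 64, 36).c_str()); // batched
}
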
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
index f2952a3d30..a0ea337eb1 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
@@ -45,17 +45,26 @@ public:
ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
// Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
};
} // namespace gemm
} // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
index 1503e74eb6..e07ad993ed 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
@@ -50,7 +50,7 @@ public:
*/
static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
index 2407c6ca5e..689a743fdf 100644
--- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
@@ -36,7 +36,9 @@ namespace opencl
{
namespace kernels
{
-Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info)
+Status validate_matmul_input_shapes(const TensorShape &lhs_shape,
+ const TensorShape &rhs_shape,
+ const MatMulKernelInfo &matmul_kernel_info)
{
const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
@@ -46,7 +48,7 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty");
constexpr size_t batch_dim_start = 2;
- for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported");
}
@@ -54,9 +56,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
- const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
- int mmul_m0, int mmul_n0)
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ int mmul_m0,
+ int mmul_n0)
{
ARM_COMPUTE_UNUSED(lhs, rhs);
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
index 210f22b109..c2ae2a67f4 100644
--- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
@@ -44,7 +44,8 @@ namespace kernels
*
 * @return true if the shapes and matmul kernel info match
*/
-Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape,
+Status validate_matmul_input_shapes(const TensorShape &lhs_shape,
+ const TensorShape &rhs_shape,
const MatMulKernelInfo &matmul_kernel_info);
/** Validate and configure window for Matmul MMUL kernels
@@ -58,9 +59,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh
*
* @return a pair of Status and Window object
*/
-std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
- const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
- int mmul_m0, int mmul_n0);
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulKernelInfo &matmul_kernel_info,
+ int mmul_m0,
+ int mmul_n0);
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
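
The two helpers reformatted above encode simple shape rules: the K dimension seen by each operand depends on the adj_lhs/adj_rhs flags, both tensors must be non-empty, and every dimension from index 2 upwards (the batch dimensions) must match exactly because broadcasting is not supported. A self-contained sketch of those rules on plain std::vector shapes; the check_shapes() helper is hypothetical and returns bool instead of a Status:

#include <cassert>
#include <vector>

using Shape = std::vector<size_t>; // index 0 = x, 1 = y, 2.. = batch dimensions

bool check_shapes(const Shape &lhs, const Shape &rhs, bool adj_lhs, bool adj_rhs)
{
    // K as seen by each operand, taking the adjoint/transposed flags into account.
    const size_t lhs_k = adj_lhs ? lhs[1] : lhs[0];
    const size_t rhs_k = adj_rhs ? rhs[0] : rhs[1];
    if (lhs_k != rhs_k)
        return false; // inner dimensions must agree
    if (lhs.size() != rhs.size())
        return false;
    // Batch dimensions must match exactly: broadcasting is not supported.
    for (size_t i = 2; i < lhs.size(); ++i)
        if (lhs[i] != rhs[i])
            return false;
    return true;
}

int main()
{
    assert(check_shapes({8, 4, 3}, {5, 8, 3}, false, false));  // K = 8 on both sides
    assert(!check_shapes({8, 4, 3}, {5, 8, 1}, false, false)); // batch mismatch
    return 0;
}
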
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
index 74a818d738..66877ebcec 100644
--- a/src/gpu/cl/operators/ClActivation.cpp
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -23,19 +23,21 @@
*/
#include "src/gpu/cl/operators/ClActivation.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClActivationKernel.h"
-
#include "src/common/IOperator.h"
#include "src/common/utils/LegacySupport.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/ClContext.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
namespace arm_compute
{
namespace opencl
{
-void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClActivation::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, act_info);
auto k = std::make_unique<kernels::ClActivationKernel>();
@@ -53,13 +55,17 @@ namespace gpu
{
namespace opencl
{
-std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src,
+ const AclTensorDescriptor &dst,
+ const AclActivationDescriptor &act,
+ bool is_validate)
{
TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
auto info = detail::convert_to_activation_info(act);
- if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false),
+ &dst_info.set_is_resizable(false), info)))
{
return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
}
@@ -68,7 +74,7 @@ std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensor
act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
- if(op == nullptr)
+ if (op == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
return std::make_tuple(nullptr, StatusCode::OutOfMemory);
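
The create_activation() path above follows the usual validate-then-configure pattern: run the static validate() on non-resizable tensor infos first, map a failure to StatusCode::UnsupportedConfig, and only allocate and configure the operator once validation has passed, reporting OutOfMemory if the allocation fails. A stripped-down sketch of that flow with hypothetical Descriptor/Operator types:

#include <memory>
#include <new>
#include <tuple>

enum class StatusCode { Success, UnsupportedConfig, OutOfMemory };

struct Descriptor { int width = 0; };

struct Operator
{
    void configure(const Descriptor &, const Descriptor &) {} // stands in for the real configure()
};

// Stand-in for the static validate() check.
static bool validate(const Descriptor &src, const Descriptor &dst)
{
    return src.width > 0 && src.width == dst.width;
}

std::tuple<std::unique_ptr<Operator>, StatusCode>
create_op(const Descriptor &src, const Descriptor &dst, bool is_validate)
{
    if (is_validate && !validate(src, dst))
        return {nullptr, StatusCode::UnsupportedConfig}; // reject unsupported configurations early

    std::unique_ptr<Operator> op(new (std::nothrow) Operator());
    if (op == nullptr)
        return {nullptr, StatusCode::OutOfMemory}; // mirrors the nullptr check after 'new' above

    op->configure(src, dst);
    return {std::move(op), StatusCode::Success};
}

int main()
{
    auto [op, status] = create_op({8}, {8}, true);
    return status == StatusCode::Success ? 0 : 1;
}
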
diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h
index 348dc27929..4f25bb5f24 100644
--- a/src/gpu/cl/operators/ClActivation.h
+++ b/src/gpu/cl/operators/ClActivation.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_ACTIVATION_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -43,7 +44,10 @@ public:
* @param[out] dst Destination tensor info. Data type supported: same as @p src
* @param[in] activation_info Activation layer parameters.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &activation_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClActivation::configure()
diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp
index b9bf505bba..b58d0df58d 100644
--- a/src/gpu/cl/operators/ClAdd.cpp
+++ b/src/gpu/cl/operators/ClAdd.cpp
@@ -23,17 +23,20 @@
*/
#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void ClAdd::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
@@ -41,8 +44,11 @@ void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1
_kernel = std::move(k);
}
-Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status ClAdd::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
}
diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h
index a17ce7b5d6..7aed902f5d 100644
--- a/src/gpu/cl/operators/ClAdd.h
+++ b/src/gpu/cl/operators/ClAdd.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_ADD_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -65,7 +66,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -73,7 +78,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
} // namespace opencl
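
As context for the policy parameter threaded through ClAdd above: ConvertPolicy controls what happens when the element-wise sum overflows the destination type, with SATURATE clamping to the representable range and WRAP keeping the low-order bits. A scalar int8 sketch of the two behaviours (the kernels apply the same rule per element; the add_s8() helper is illustrative only):

#include <algorithm>
#include <cstdint>
#include <cstdio>

enum class ConvertPolicy { WRAP, SATURATE };

int8_t add_s8(int8_t a, int8_t b, ConvertPolicy policy)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
    if (policy == ConvertPolicy::SATURATE)
        return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, sum)));
    // WRAP: keep the low 8 bits (two's-complement wrap-around on common toolchains).
    return static_cast<int8_t>(sum);
}

int main()
{
    std::printf("%d\n", add_s8(100, 100, ConvertPolicy::SATURATE)); // 127
    std::printf("%d\n", add_s8(100, 100, ConvertPolicy::WRAP));     // -56
}
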
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
index 05ea21b734..8f26ef003d 100644
--- a/src/gpu/cl/operators/ClCast.cpp
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -23,16 +23,18 @@
*/
#include "src/gpu/cl/operators/ClCast.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClCastKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+void ClCast::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ ConvertPolicy policy)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
auto k = std::make_unique<kernels::ClCastKernel>();
diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h
index 1b67ff7c8e..25d2293673 100644
--- a/src/gpu/cl/operators/ClCast.h
+++ b/src/gpu/cl/operators/ClCast.h
@@ -58,7 +58,8 @@ public:
 * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy.
*/
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ void
+ configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClCast::configure()
diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp
index a27fc37cc4..31018b9768 100644
--- a/src/gpu/cl/operators/ClConcatenate.cpp
+++ b/src/gpu/cl/operators/ClConcatenate.cpp
@@ -23,9 +23,14 @@
*/
#include "src/gpu/cl/operators/ClConcatenate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
@@ -33,42 +38,39 @@
#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-
-#include "src/common/utils/Log.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
+void ClConcatenate::configure(const CLCompileContext &compile_context,
+ const std::vector<ITensorInfo *> &src_vector,
+ ITensorInfo *dst,
+ size_t axis)
{
ARM_COMPUTE_ERROR_ON(dst == nullptr);
ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis);
_axis = axis;
_num_inputs = src_vector.size();
- TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
- std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t;
- });
+ std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(),
+ [](ITensorInfo *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t;
+ });
    // dst auto initialization if not yet initialized
auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
unsigned int offset = 0;
- switch(_axis)
+ switch (_axis)
{
case Window::DimX:
{
- switch(_num_inputs)
+ switch (_num_inputs)
{
case 2:
{
@@ -82,14 +84,15 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std
{
// Configure WidthConcatenate4Tensors kernel
auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
- kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2),
+ src_vector.at(3), dst);
_concat_kernels.emplace_back(std::move(kernel));
break;
}
default:
{
// Configure generic case WidthConcatenate kernels
- for(unsigned int i = 0; i < _num_inputs; ++i)
+ for (unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -103,7 +106,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std
}
case Window::DimY:
{
- for(unsigned int i = 0; i < _num_inputs; ++i)
+ for (unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -114,7 +117,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std
}
case Window::DimZ:
{
- for(unsigned int i = 0; i < _num_inputs; ++i)
+ for (unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -125,7 +128,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std
}
case 3:
{
- for(unsigned int i = 0; i < _num_inputs; ++i)
+ for (unsigned int i = 0; i < _num_inputs; ++i)
{
auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -148,25 +151,27 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto
ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
unsigned int offset = 0;
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
- switch(num_inputs)
+ switch (num_inputs)
{
case 2:
// Validate WidthConcatenate2Tensors kernels if there are 2 inputs
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
break;
case 4:
// Validate WidthConcatenate4Tensors kernels if there are 4 inputs
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(
+ src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
break;
default:
// Validate generic case of WidthConcatenate kernel
- for(const auto &src : src_vector)
+ for (const auto &src : src_vector)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
@@ -178,7 +183,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto
}
case Window::DimY:
{
- for(const auto &src : src_vector)
+ for (const auto &src : src_vector)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
offset += src->dimension(axis);
@@ -187,7 +192,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto
}
case Window::DimZ:
{
- for(const auto &src : src_vector)
+ for (const auto &src : src_vector)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
offset += src->dimension(axis);
@@ -196,7 +201,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto
}
case 3:
{
- for(const auto &src : src_vector)
+ for (const auto &src : src_vector)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
offset += src->dimension(axis);
@@ -207,7 +212,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto
ARM_COMPUTE_ERROR("Axis not supported");
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
@@ -218,17 +223,17 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto
void ClConcatenate::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
- if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+ if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
{
ARM_COMPUTE_ERROR("Configured with different number of inputs");
}
- if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+ if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
{
ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
@@ -236,7 +241,7 @@ void ClConcatenate::run(ITensorPack &tensors)
else
{
int i = 0;
- for(auto &k : _concat_kernels)
+ for (auto &k : _concat_kernels)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
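
The dispatch reformatted above boils down to one pattern per axis: walk the inputs in order, configure one concatenation kernel per input at a running offset, and advance the offset by each input's extent along the concatenation axis (the 2- and 4-tensor width kernels are fused special cases of the same idea). A 1-D illustration of the running-offset scheme using std::vector instead of CL tensors:

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<float> concatenate(const std::vector<std::vector<float>> &srcs)
{
    std::size_t total = 0;
    for (const auto &s : srcs)
        total += s.size();

    std::vector<float> dst(total);
    std::size_t offset = 0; // mirrors the 'offset' accumulated while configuring each kernel
    for (const auto &s : srcs)
    {
        std::copy(s.begin(), s.end(), dst.begin() + offset);
        offset += s.size(); // the next input starts where this one ends
    }
    return dst;
}

int main()
{
    const auto out = concatenate({{1.f, 2.f}, {3.f}, {4.f, 5.f}});
    return out.size() == 5 ? 0 : 1;
}
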
diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h
index de0cf84d2c..d8ce9d2a5c 100644
--- a/src/gpu/cl/operators/ClConcatenate.h
+++ b/src/gpu/cl/operators/ClConcatenate.h
@@ -57,7 +57,10 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src_vector.
 * @param[in]  axis            Concatenation axis. Supported underlying concatenation axes are 0, 1, 2 and 3.
*/
- void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
+ void configure(const ClCompileContext &compile_context,
+ const std::vector<ITensorInfo *> &src_vector,
+ ITensorInfo *dst,
+ size_t axis);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClConcatenate::configure()
@@ -71,8 +74,8 @@ public:
private:
std::vector<std::unique_ptr<IClKernel>> _concat_kernels{};
- unsigned int _num_inputs{ 0 };
- unsigned int _axis{ 0 };
+ unsigned int _num_inputs{0};
+ unsigned int _axis{0};
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index eb9475ccaa..2c3b0214fa 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -23,17 +23,17 @@
*/
#include "src/gpu/cl/operators/ClConv2d.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/operators/ClDirectConv2d.h"
#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "src/gpu/cl/operators/ClIndirectConv2d.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"
-#include "src/common/utils/Log.h"
-
#include <memory>
namespace
@@ -48,7 +48,7 @@ namespace
*/
size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
{
- switch(gpu_target)
+ switch (gpu_target)
{
case arm_compute::GPUTarget::G76:
case arm_compute::GPUTarget::G77:
@@ -71,27 +71,33 @@ namespace opencl
{
using namespace arm_compute::misc::shape_calculator;
-ClConv2d::ClConv2d()
- : _operator()
+ClConv2d::ClConv2d() : _operator()
{
}
ClConv2d::~ClConv2d() = default;
-void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info)
+void ClConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
- switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
+ switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
{
case ConvolutionMethod::WINOGRAD:
{
ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
auto f = std::make_unique<ClWinogradConv2d>();
- f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info,
+ conv2d_info.enable_fast_math);
_operator = std::move(f);
break;
}
@@ -125,35 +131,46 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s
_aux_mem = _operator->workspace();
}
-Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+Status ClConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
const GPUTarget gpu_target = CLScheduler::get().target();
- switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
+ switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
{
case ConvolutionMethod::WINOGRAD:
{
//Validate Winograd
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+ "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info,
+ conv2d_info.act_info, conv2d_info.enable_fast_math));
break;
}
case ConvolutionMethod::DIRECT:
{
// Validate direct convolution layer
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+ "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
break;
}
case ConvolutionMethod::INDIRECT:
{
// Validate indirect convolution layer
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
- ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+ "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
break;
}
case ConvolutionMethod::GEMM:
@@ -170,8 +187,12 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co
return Status{};
}
-ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info, const GPUTarget gpu_target)
+ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info,
+ const GPUTarget gpu_target)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
@@ -191,20 +212,35 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
- const std::vector<ConfigurationMethod> known_configs =
- {
+ const std::vector<ConfigurationMethod> known_configs = {
// Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+ PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW),
+ ConvolutionMethod::DIRECT),
// VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW),
+ ConvolutionMethod::DIRECT),
// Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+ ConvolutionMethod::GEMM),
// Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+ ConvolutionMethod::GEMM),
// Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+ ConvolutionMethod::GEMM),
// Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+ ConvolutionMethod::GEMM),
};
const auto find_config = [&](ConfigurationMethod c)
@@ -213,76 +249,89 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
const PadStrideInfo info = std::get<3>(config);
const DataLayout data_layout = std::get<4>(config);
- return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout());
+ return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) &&
+ std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride() && (data_layout == src->data_layout());
};
std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
{
return (*found).second;
}
- if(dilation != Size2D(1U, 1U))
+ if (dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
else
{
- if(src->data_layout() == DataLayout::NCHW)
+ if (src->data_layout() == DataLayout::NCHW)
{
// SRGAN
- if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
- && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
+ if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+ (conv_info.pad_top() < 3) &&
+ (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
{
return ConvolutionMethod::DIRECT;
}
- if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
+ if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) &&
+ (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
{
return ConvolutionMethod::FFT;
}
- if(src->dimension(idx_c) < 16)
+ if (src->dimension(idx_c) < 16)
{
return ConvolutionMethod::GEMM;
}
- return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))
+ ? ConvolutionMethod::WINOGRAD
+ : ConvolutionMethod::GEMM;
}
else
{
- const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
- const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
+ const bool is_direct_valid =
+ bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+ const bool is_wino_valid =
+ bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
// SRGAN case
- if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
- && is_direct_valid)
+ if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+ (conv_info.pad_top() < 3) && is_direct_valid)
{
return ConvolutionMethod::DIRECT;
}
// Floating-point case: GeMM/Direct/Winograd
- if(is_data_type_float(src->data_type()))
+ if (is_data_type_float(src->data_type()))
{
// Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
- const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
- const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
- const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
- const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
- const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
- const bool is_m_one = output_shape[1] * output_shape[2] == 1;
- const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
- const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
+ TensorShape output_shape =
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) &&
+ (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
+ const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
+ const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+ const bool is_m_one = output_shape[1] * output_shape[2] == 1;
+ const bool is_unit_stride =
+ (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+ const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
// Run Winograd if valid and IFM >= 8
- if(is_wino_valid && is_ifm_ge_8)
+ if (is_wino_valid && is_ifm_ge_8)
{
- if(is_ofm_lte_8)
+ if (is_ofm_lte_8)
{
- if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)
+ if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+ get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)
{
return ConvolutionMethod::WINOGRAD;
}
@@ -294,18 +343,19 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
}
// Direct convolution case
- if(is_direct_valid)
+ if (is_direct_valid)
{
- if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD))
+ if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+ get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD))
{
- if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+ if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
{
return ConvolutionMethod::DIRECT;
}
}
- else if(gpu_target == arm_compute::GPUTarget::G76)
+ else if (gpu_target == arm_compute::GPUTarget::G76)
{
- if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+ if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
{
return ConvolutionMethod::DIRECT;
}
@@ -314,21 +364,24 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
{
ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT;
- const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+ const bool is_indirect_valid =
+ bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
// indirect conv2d should be called when:
// 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81)
// 2- When the kernel size is odd
// 3- When the Gpu target is Arm Mali-G77
- if(is_indirect_valid)
+ if (is_indirect_valid)
{
const bool is_kernel_sz_odd = kernel_sz % 2;
const bool is_g77 = gpu_target == GPUTarget::G77;
- preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
+ preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77
+ ? ConvolutionMethod::INDIRECT
+ : ConvolutionMethod::DIRECT;
}
// Direct/indirect convolution used for the first layer of the network
- if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64)
+ if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64)
{
// In general, the question we should ask for the first convolution layer of a model is:
                // when is the execution time of im2col + gemm < direct? Since im2col does not depend on the OFM, it means that
@@ -337,13 +390,13 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const
return preferred_conv_method;
}
- if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16)
+ if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16)
{
return preferred_conv_method;
}
// Direct convolution used for the last layer of the network
- if(is_ofm_lte_8)
+ if (is_ofm_lte_8)
{
return preferred_conv_method;
}
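
The heuristic reformatted above is the densest piece of logic in this patch: dilation always falls back to GEMM, a table of known layer shapes (AlexNet, VGG, MobileNet) is matched first, and otherwise the data layout, kernel size, channel counts and GPU target steer the choice between WINOGRAD, DIRECT, INDIRECT, FFT and GEMM. A deliberately simplified sketch of that decision skeleton; the thresholds and the pick_method() signature are illustrative, not the library's:

#include <cstdio>

enum class ConvolutionMethod { GEMM, WINOGRAD, DIRECT, INDIRECT, FFT };

// kernel_sz is width * height, so a 9x9 kernel is 81.
ConvolutionMethod pick_method(bool dilated, bool winograd_valid, bool direct_valid,
                              unsigned int ifm, unsigned int ofm, unsigned int kernel_sz,
                              bool is_g77)
{
    if (dilated)
        return ConvolutionMethod::GEMM; // dilation is only handled by the GEMM path

    if (winograd_valid && ifm >= 8)
        return ConvolutionMethod::WINOGRAD; // Winograd wins when applicable and the IFM is wide enough

    if (direct_valid && (kernel_sz >= 25 || ofm <= 8))
    {
        // Odd kernels larger than 1x1 and up to 9x9 prefer the indirect variant on Mali-G77.
        const bool odd_kernel = (kernel_sz % 2) != 0;
        return (is_g77 && odd_kernel && kernel_sz > 1 && kernel_sz <= 81) ? ConvolutionMethod::INDIRECT
                                                                          : ConvolutionMethod::DIRECT;
    }

    return ConvolutionMethod::GEMM; // im2col + GEMM remains the safe default
}

int main()
{
    std::printf("%d\n", static_cast<int>(pick_method(false, false, true, 16, 8, 81, true))); // 3 = INDIRECT
}
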
diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h
index c6c366a762..0cf3cbc1ce 100644
--- a/src/gpu/cl/operators/ClConv2d.h
+++ b/src/gpu/cl/operators/ClConv2d.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
#include "src/gpu/cl/IClOperator.h"
@@ -112,15 +113,24 @@ public:
* @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref ClConv2d
*
* Similar to ClConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will return the convolution called by @ref ClConv2d
*
@@ -137,11 +147,15 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info, const GPUTarget gpu_target);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info,
+ const GPUTarget gpu_target);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
index 08122b6852..cf24c68d21 100644
--- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
@@ -23,16 +23,19 @@
*/
#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
@@ -40,9 +43,12 @@ void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_c
_kernel = std::move(k);
}
-Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
}
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
index 2794eb17b0..c46152081c 100644
--- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
@@ -43,14 +43,21 @@ public:
* @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClConvertFullyConnectedWeights::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp
index d3b83040d0..e2be7cebd4 100644
--- a/src/gpu/cl/operators/ClCopy.cpp
+++ b/src/gpu/cl/operators/ClCopy.cpp
@@ -23,11 +23,10 @@
*/
#include "src/gpu/cl/operators/ClCopy.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClCopyKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
@@ -45,4 +44,4 @@ Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *
return kernels::ClCopyKernel::validate(src, dst, dst_window);
}
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h
index 9b427f9675..fe9b58c607 100644
--- a/src/gpu/cl/operators/ClCopy.h
+++ b/src/gpu/cl/operators/ClCopy.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_COPY_H
#include "arm_compute/core/Window.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -44,7 +45,10 @@ public:
 * @param[in] dst_window (Optional) Window to be used when copying into only part of a tensor. Default is nullptr.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClCopy::configure()
diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp
index cef9f14c7d..6313e4fbb5 100644
--- a/src/gpu/cl/operators/ClCrop.cpp
+++ b/src/gpu/cl/operators/ClCrop.cpp
@@ -23,17 +23,22 @@
*/
#include "src/gpu/cl/operators/ClCrop.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClCropKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void ClCrop::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
auto k = std::make_unique<kernels::ClCropKernel>();
@@ -41,9 +46,15 @@ void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInf
_kernel = std::move(k);
}
-Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status ClCrop::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
}
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h
index 1cf1c9bff4..e845cf372c 100644
--- a/src/gpu/cl/operators/ClCrop.h
+++ b/src/gpu/cl/operators/ClCrop.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_CROP_H
#include "arm_compute/core/Window.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -49,16 +50,27 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClCrop::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *dst_window = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
};
} // namespace opencl
} // namespace arm_compute
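
The crop interface takes pixel-space start/end coordinates plus a batch index. The sketch below only illustrates the calling convention; the check_crop helper is hypothetical and the coordinates are placeholders, not part of the patch.

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/gpu/cl/operators/ClCrop.h"

// Hypothetical helper: reports whether a 32x32 crop of batch element 0 is a valid configuration.
arm_compute::Status check_crop(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst)
{
    using namespace arm_compute;
    const Coordinates2D start{0, 0};
    const Coordinates2D end{31, 31};
    // extrapolation_value fills samples that fall outside the source image.
    return opencl::ClCrop::validate(src, dst, start, end, /* batch_index */ 0U,
                                    /* extrapolation_value */ 0.f, /* dst_window */ nullptr);
}
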
diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp
index 0fccab63e0..eb6f9e7abb 100644
--- a/src/gpu/cl/operators/ClDequantize.cpp
+++ b/src/gpu/cl/operators/ClDequantize.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
namespace arm_compute
{
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
index 0215dba422..17a196ce6b 100644
--- a/src/gpu/cl/operators/ClDirectConv2d.cpp
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -26,6 +26,8 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/gpu/cl/kernels/ClActivationKernel.h"
@@ -35,8 +37,6 @@
#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
-#include "src/common/utils/Log.h"
-
using namespace arm_compute::cl_direct_conv;
namespace arm_compute
@@ -53,7 +53,8 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors)
return pack;
}
-DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
{
// Get GPU target
GPUTarget gpu_target = CLScheduler::get().target();
@@ -65,8 +66,13 @@ DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *sr
} // namespace
-void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void ClDirectConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
@@ -75,15 +81,17 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI
const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
// Configure direct convolution kernel
- const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo();
- auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
+ const ActivationLayerInfo conv2d_act_info =
+ (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info
+ : ActivationLayerInfo();
+ auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
k->set_target(CLScheduler::get().target());
k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc);
_direct_conv_kernel = std::move(k);
// Configure border handler
PixelValue zero_value(0.f);
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
zero_value = PixelValue(0, src->data_type(), src->quantization_info());
}
@@ -92,7 +100,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI
_src_border_handler = std::move(b);
// Fused activation is currently supported for NHWC and floating point types
- if(act_info.enabled() && !conv2d_act_info.enabled())
+ if (act_info.enabled() && !conv2d_act_info.enabled())
{
auto a = std::make_unique<kernels::ClActivationKernel>();
a->configure(compile_context, dst, dst, act_info);
@@ -103,14 +111,19 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI
CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
}
-Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+Status ClDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
// Initialize the direct convolution descriptor
const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc));
- if(act_info.enabled())
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc));
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
}
@@ -124,7 +137,7 @@ void ClDirectConv2d::run(ITensorPack &tensors)
// Run direct convolution
CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
// Run activation kernel
- if(_activation_kernel)
+ if (_activation_kernel)
{
auto act_pack = select_activation_src_dst(tensors);
CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h
index fedb9e971e..0f18490814 100644
--- a/src/gpu/cl/operators/ClDirectConv2d.h
+++ b/src/gpu/cl/operators/ClDirectConv2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
#include "src/gpu/cl/IClOperator.h"
@@ -59,7 +60,12 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -67,16 +73,20 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited method overridden
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
- std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
- std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
+ std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr};
+ std::unique_ptr<IClKernel> _src_border_handler{nullptr};
+ std::unique_ptr<IClKernel> _activation_kernel{nullptr};
};
} // namespace opencl
} // namespace arm_compute
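
As the operator source above shows, the activation is fused into the direct-convolution kernel only for NHWC floating-point tensors; otherwise a separate activation kernel is appended after the convolution. A validate-only sketch of the reflowed interface follows; the tensor infos are passed in so no shapes are assumed, and the helper name is illustrative only.

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/gpu/cl/operators/ClDirectConv2d.h"

// Hypothetical helper: checks a stride-1, unpadded direct convolution with a ReLU activation request.
arm_compute::Status check_direct_conv2d(const arm_compute::ITensorInfo *src,
                                        const arm_compute::ITensorInfo *weights,
                                        const arm_compute::ITensorInfo *biases,
                                        const arm_compute::ITensorInfo *dst)
{
    using namespace arm_compute;
    const PadStrideInfo       conv_info(1, 1, 0, 0); // stride x/y = 1, pad x/y = 0
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);
    return opencl::ClDirectConv2d::validate(src, weights, biases, dst, conv_info, act_info);
}
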
diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp
index 5d37f07f31..b08347936b 100644
--- a/src/gpu/cl/operators/ClDirectConv3d.cpp
+++ b/src/gpu/cl/operators/ClDirectConv3d.cpp
@@ -24,13 +24,19 @@
#include "src/gpu/cl/operators/ClDirectConv3d.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"
namespace arm_compute
{
namespace opencl
{
-void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+void ClDirectConv3d::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src0);
@@ -40,7 +46,11 @@ void ClDirectConv3d::configure(const CLCompileContext &compile_context, const IT
_direct_conv3d_kernel = std::move(k);
}
-Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status ClDirectConv3d::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info));
return Status{};
diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h
index fa58b5aedd..5fb32460e2 100644
--- a/src/gpu/cl/operators/ClDirectConv3d.h
+++ b/src/gpu/cl/operators/ClDirectConv3d.h
@@ -67,7 +67,12 @@ public:
* @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
*
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
/** Static function to check if given info will lead to a valid configuration
*
@@ -75,14 +80,18 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
// Inherited method overridden
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<IClKernel> _direct_conv3d_kernel{ nullptr };
+ std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr};
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp
index 32d2b88798..1325371d19 100644
--- a/src/gpu/cl/operators/ClElementwiseOperations.cpp
+++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp
@@ -23,15 +23,18 @@
*/
#include "src/gpu/cl/operators/ClElementwiseOperations.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
namespace arm_compute
{
namespace opencl
{
-void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseDivision::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -39,12 +42,19 @@ void ClElementwiseDivision::configure(const ClCompileContext &compile_context, I
_kernel = std::move(k);
}
-Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseDivision::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
}
-void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseMax::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -52,12 +62,19 @@ void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITenso
_kernel = std::move(k);
}
-Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseMax::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
}
-void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseMin::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -65,12 +82,19 @@ void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITenso
_kernel = std::move(k);
}
-Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseMin::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
}
-void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -78,12 +102,19 @@ void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context
_kernel = std::move(k);
}
-Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
}
-void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwisePower::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -91,7 +122,10 @@ void ClElementwisePower::configure(const ClCompileContext &compile_context, ITen
_kernel = std::move(k);
}
-Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwisePower::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
}
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h
index 120049cb7f..de7c018d75 100644
--- a/src/gpu/cl/operators/ClElementwiseOperations.h
+++ b/src/gpu/cl/operators/ClElementwiseOperations.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -48,14 +49,21 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClElementwiseDivision::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
@@ -74,14 +82,21 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClElementwiseMax::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
@@ -100,14 +115,21 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClElementwiseMin::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
@@ -126,14 +148,21 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClElementwiseSquaredDiff::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
@@ -152,14 +181,21 @@ public:
* @param[out] dst Destination tensor info. Data types supported:F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClElementwisePower::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
} // namespace opencl
} // namespace arm_compute
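
All five elementwise operators reformatted above (division, max, min, squared difference, power) expose the same four-parameter configure/validate pair and differ only in the ArithmeticOperation they forward to ClArithmeticKernel. The sketch below shows one of them; the shapes, the ctx argument and the setup_max helper are assumptions for the example, not part of the patch.

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/operators/ClElementwiseOperations.h"

// Hypothetical helper: configures an elementwise max of two F32 tensors with identical shapes (no broadcasting).
arm_compute::Status setup_max(const arm_compute::CLCompileContext &ctx, arm_compute::opencl::ClElementwiseMax &op)
{
    using namespace arm_compute;
    TensorInfo a(TensorShape(8U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(8U, 4U), 1, DataType::F32);
    TensorInfo out(TensorShape(8U, 4U), 1, DataType::F32);

    const Status status = opencl::ClElementwiseMax::validate(&a, &b, &out); // act_info defaults to disabled
    if (status.error_code() == ErrorCode::OK)
    {
        op.configure(ctx, &a, &b, &out);
    }
    return status;
}
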
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
index f94d402c05..914621183e 100644
--- a/src/gpu/cl/operators/ClElementwiseUnary.cpp
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -23,9 +23,8 @@
*/
#include "src/gpu/cl/operators/ClElementwiseUnary.h"
-#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
namespace arm_compute
{
diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp
index ad22b15cff..817b15ab20 100644
--- a/src/gpu/cl/operators/ClFill.cpp
+++ b/src/gpu/cl/operators/ClFill.cpp
@@ -23,16 +23,18 @@
*/
#include "src/gpu/cl/operators/ClFill.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClFillKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
+void ClFill::configure(const ClCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *dst_window)
{
ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window);
auto k = std::make_unique<kernels::ClFillKernel>();
@@ -45,4 +47,4 @@ Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_va
return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
}
} // namespace opencl
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h
index 3bbe27ef71..e13862aa6b 100644
--- a/src/gpu/cl/operators/ClFill.h
+++ b/src/gpu/cl/operators/ClFill.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Window.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -44,7 +45,10 @@ public:
* @param[in] constant_value The value used to fill the planes of the tensor
* @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClFill::configure()
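
ClFill takes a single tensor info plus a PixelValue, optionally restricted to a window. A short illustrative sketch follows; the shape and the setup_zero_fill helper are assumptions for the example only.

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"

#include "src/gpu/cl/operators/ClFill.h"

// Hypothetical helper: configures a fill of the whole tensor with zeros.
void setup_zero_fill(const arm_compute::CLCompileContext &ctx, arm_compute::opencl::ClFill &fill)
{
    using namespace arm_compute;
    TensorInfo tensor(TensorShape(32U, 32U), 1, DataType::F32);
    const PixelValue zero(0.f); // same construction as the border handler in ClDirectConv2d.cpp above
    fill.configure(ctx, &tensor, zero, /* window */ nullptr); // nullptr -> fill every element
}
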
diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp
index e277c0d7e4..7532532c94 100644
--- a/src/gpu/cl/operators/ClFlatten.cpp
+++ b/src/gpu/cl/operators/ClFlatten.cpp
@@ -23,11 +23,10 @@
*/
#include "src/gpu/cl/operators/ClFlatten.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClReshapeKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp
index 84f685e381..6790160172 100644
--- a/src/gpu/cl/operators/ClFloor.cpp
+++ b/src/gpu/cl/operators/ClFloor.cpp
@@ -23,11 +23,10 @@
*/
#include "src/gpu/cl/operators/ClFloor.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClFloorKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
index 5845bbc69e..6969ac8ab3 100644
--- a/src/gpu/cl/operators/ClFullyConnected.cpp
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -24,12 +24,13 @@
#include "src/gpu/cl/operators/ClFullyConnected.h"
#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
#include "src/gpu/cl/operators/ClFlatten.h"
@@ -38,11 +39,8 @@
#include "src/gpu/cl/operators/ClMatMul.h"
#include "src/gpu/cl/operators/ClTranspose.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
-
-#include "src/common/utils/Log.h"
#include "support/Cast.h"
#include <algorithm>
@@ -62,8 +60,11 @@ inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src)
return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation
}
-Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo &src,
+ const ITensorInfo &weights,
+ const ITensorInfo &dst,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ ActivationLayerInfo activation_info)
{
gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
gemmlowp_output_stage.gemmlowp_offset = 0;
@@ -73,7 +74,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo
const auto data_type = src.data_type();
// Configure output stage for quantized case
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
const QuantizationInfo oq_info = dst.quantization_info();
const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
@@ -85,15 +86,17 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo
const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
PixelValue type_min{};
PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(data_type);
- if(activation_info.enabled())
+ if (activation_info.enabled())
{
- std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
+ std::tie(type_min, type_max) =
+ get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
}
// Set the GEMMLowp output stage info
@@ -109,31 +112,41 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo
return Status{};
}
-Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul)
+Status validate_mm(const ITensorInfo &src,
+ const ITensorInfo &weights,
+ const ITensorInfo *bias,
+ const ITensorInfo &dst,
+ const FullyConnectedLayerInfo &fc_info,
+ bool use_matmul)
{
// Note : If input is dynamic and data is not batched, use matmul, else use gemm
const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
- const bool use_dynamic_gemm = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul
- const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type());
+ const bool use_dynamic_gemm =
+ !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul
+ const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type());
- if(use_matmul)
+ if (use_matmul)
{
const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights);
// Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1]
TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape()));
- const GPUTarget gpu_target = CLScheduler::get().target();
- std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
- const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info);
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t =
+ cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+ const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info);
- return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) :
- kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info);
+ return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst,
+ kernel_info, fc_info.activation_info)
+ : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info,
+ fc_info.activation_info);
}
else
{
GEMMLowpOutputStageInfo gemmlowp_output_stage;
- ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
@@ -147,7 +160,7 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe
true, // broadcast_bias
ActivationLayerInfo()); // activation_info
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
@@ -158,11 +171,9 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe
const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
// Validate gemmlowp function
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info),
- &weights.clone()->set_quantization_info(weights_quantization_info),
- bias,
- &dst,
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(
+ &src.clone()->set_quantization_info(src_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info));
}
else
{
@@ -188,11 +199,15 @@ ClFullyConnected::ClFullyConnected()
ClFullyConnected::~ClFullyConnected() = default;
-void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+void ClFullyConnected::configure_mm(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
const FullyConnectedLayerInfo &fc_info)
{
// If weights are dynamic and matmul is supported use matmul, else use gemm
- if(_use_matmul)
+ if (_use_matmul)
{
// Specify whether transpose weights is necessary in matmul info
const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights);
@@ -202,22 +217,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe
_lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape()));
// 2. Use heuristics to get kernel info object
- const GPUTarget gpu_target = CLScheduler::get().target();
- std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
- MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info);
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config =
+ cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+ MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info);
// 3. Configure relevant matmul kernel
- if(_is_quantized)
+ if (_is_quantized)
{
_matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>();
_matmul_lowp_native_kernel->set_target(gpu_target);
- _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info);
+ _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+ fc_info.activation_info);
}
else
{
_matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>();
_matmul_native_kernel->set_target(gpu_target);
- _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info);
+ _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+ fc_info.activation_info);
}
}
else
@@ -238,7 +256,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe
true, // broadcast_bias
fc_info.activation_info); // activation_info
- if(_is_quantized)
+ if (_is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -248,8 +266,10 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
- src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
- weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+ src_info.set_quantization_info(
+ QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
+ weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale,
+ -weights_quantization_info.uniform().offset));
// Configure gemmlowp function
_mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
@@ -264,16 +284,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe
}
}
-void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
const FullyConnectedLayerInfo &fc_info)
{
// MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate.
- ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+ ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) !=
+ (src->dimension(0) * src->dimension(1) * src->dimension(2))));
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for flatten
- _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW);
+ _flattened_src = src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(compute_flatten_shape(src))
+ .set_data_layout(DataLayout::NCHW);
// Configure flatten kernel
_flatten = std::make_unique<ClFlatten>();
@@ -284,7 +313,11 @@ void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context
configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
}
-void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
const FullyConnectedLayerInfo &fc_info)
{
// MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate.
@@ -294,7 +327,11 @@ void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context,
configure_mm(compile_context, src, weights, bias, dst, fc_info);
}
-void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+void ClFullyConnected::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
FullyConnectedLayerInfo fc_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -317,8 +354,9 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
// 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched.
// 3. When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required)
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
- _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul;
+ _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer &&
+ !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+ _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul;
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -327,11 +365,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
// 4) Fully Connected layer -> Fully Connected layer with batches
// Check if we have a fully connected layer with batches
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
- src->tensor_shape().cend(),
- dst->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
@@ -341,7 +379,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
ITensorInfo *weights_used = weights;
// Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op.
- if(_transpose_weights && !_use_matmul)
+ if (_transpose_weights && !_use_matmul)
{
// Reshape the weights
_reshape_weights = std::make_unique<ClTranspose>();
@@ -351,14 +389,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
}
// Convert weights if needed
- if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Convert weights
_convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
- _convert_weights->configure(compile_context,
- weights_used,
- &_converted_weights,
- src->tensor_shape(),
+ _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(),
fc_info.weights_trained_layout);
weights_used = &_converted_weights;
@@ -366,7 +401,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
_run_convert_weights = true;
}
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
@@ -379,60 +414,69 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso
// Update TensorInfo of final weights used (Need to be done in the end due to padding expansion)
_weights_to_use = *weights_used;
- if(_use_matmul)
+ if (_use_matmul)
{
// Note : MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem
- _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
+ _aux_mem[ConvertedWeights] =
+ MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
}
else
{
// Set auxiliary memory requirements for gemm operators
auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
- for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
{
_aux_mem[i] = gemm_mem_req[i];
}
- if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
+ if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
// Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
_aux_mem[TransposedWeights] = MemoryInfo(
- offset_int_vec(TransposedWeights),
- _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
- _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
- _converted_weights.total_size());
+ offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
}
else
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
- const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
-
- _aux_mem[TransposedWeights] = MemoryInfo(
- offset_int_vec(TransposedWeights),
- _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft,
- _reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft,
- _converted_weights.total_size());
+ const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights))
+ ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare;
+ const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights))
+ ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare;
+
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+ _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft,
+ _converted_weights.total_size());
}
}
- _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+ _aux_mem[FlattenedSrc] =
+ MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
-Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+Status ClFullyConnected::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
FullyConnectedLayerInfo fc_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target());
const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
@@ -441,11 +485,20 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei
// When using dynamic weights - use matmul kernels.
// Note: MatMul does not support broadcasting so fallback with batched cases.
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
-
- const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding());
+ const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() &&
+ !is_batched_fc_layer &&
+ !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+
+ const ITensorInfo &flatten_src = TensorInfo(src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(compute_flatten_shape(src))
+ .set_data_layout(DataLayout::NCHW));
+ const ITensorInfo &reshaped_weights = TensorInfo(
+ weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = (transpose_weights && !use_matmul)
+ ? TensorInfo(*reshaped_weights.clone())
+ : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding());
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -456,10 +509,10 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei
const ITensorInfo *src_to_use = src;
const ITensorInfo *weights_to_use = weights;
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -470,11 +523,11 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei
}
// Check if FC is after conv (flatten kernel is run in case where FC is after conv.)
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
- src->tensor_shape().cend(),
- dst->tensor_shape().cbegin() + 1));
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
@@ -482,29 +535,28 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei
}
// Transpose kernel does not run when matmul is supported as matmul fuses transpose op.
- if(transpose_weights && !use_matmul)
+ if (transpose_weights && !use_matmul)
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
- if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
weights_to_use = &converted_weights;
}
- if(is_fc_after_conv)
+ if (is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
// K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled
const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1;
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
// Validate flatten kernel
ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
@@ -539,24 +591,24 @@ void ClFullyConnected::run(ITensorPack &tensors)
CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
// Linearize input if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
- ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+ ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
_flatten->run(flatten_pack);
}
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
- if(_weights_to_use_idx != ACL_SRC_1)
+ if (_weights_to_use_idx != ACL_SRC_1)
{
gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
}
// Run MatMul Op
- if(_use_matmul)
+ if (_use_matmul)
{
// Run matmul kernels for matrix multiplication
- if(_is_quantized)
+ if (_is_quantized)
{
CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true);
}
@@ -568,7 +620,7 @@ void ClFullyConnected::run(ITensorPack &tensors)
else
{
// Run matrix multiply
- if(_is_quantized)
+ if (_is_quantized)
{
_mm_gemmlowp->run(gemm_pack);
}
@@ -582,7 +634,7 @@ void ClFullyConnected::run(ITensorPack &tensors)
void ClFullyConnected::prepare(ITensorPack &tensors)
{
// Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed.
- if(!_is_prepared || _dynamic_gemm)
+ if (!_is_prepared || _dynamic_gemm)
{
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
++_asrt_prepare_count;
@@ -598,10 +650,10 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
const ITensor *cur_weights = weights;
// Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose.
- if(_transpose_weights && !_use_matmul)
+ if (_transpose_weights && !_use_matmul)
{
// Run reshape weights kernel and mark weights as unused
- ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
+ ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
_reshape_weights->run(transpose_pack);
cur_weights->mark_as_unused();
@@ -609,9 +661,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
}
// Convert weights if needed
- if(_run_convert_weights)
+ if (_run_convert_weights)
{
- ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+ ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
_convert_weights->run(convert_pack);
cur_weights->mark_as_unused();
@@ -622,9 +674,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors)
gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
// Run GEMM prepare and release unused weights
- if(_dynamic_gemm || !_use_matmul)
+ if (_dynamic_gemm || !_use_matmul)
{
- if(!_is_quantized)
+ if (!_is_quantized)
{
_mm_gemm->prepare(gemm_pack);
}
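A side sketch, not part of this commit: the run() and prepare() hunks above repeatedly build ITensorPack objects from {slot, tensor} pairs — e.g. {{ACL_SRC, src}, {ACL_DST, flattened_src.get()}} — and hand them to kernels via CLScheduler::get().enqueue_op() or to nested operators. The standalone snippet below mirrors that pack-then-dispatch shape with stand-in types (Pack, Tensor and TensorId are illustrative, not arm_compute API) and uses the post-clang-format brace and if-spacing style.

// Standalone sketch (not ACL code): pack tensors by slot id, then dispatch.
#include <cstdio>
#include <initializer_list>
#include <unordered_map>
#include <utility>

enum TensorId
{
    ACL_SRC = 0,
    ACL_DST = 1
};

struct Tensor
{
    const char *name;
};

// Minimal stand-in for ITensorPack: maps slot ids to tensor pointers.
class Pack
{
public:
    Pack(std::initializer_list<std::pair<int, Tensor *>> items)
    {
        for (const auto &item : items)
        {
            _map[item.first] = item.second;
        }
    }
    Tensor *get(int id) const
    {
        const auto it = _map.find(id);
        return (it != _map.end()) ? it->second : nullptr;
    }

private:
    std::unordered_map<int, Tensor *> _map{};
};

int main()
{
    Tensor src{"src"};
    Tensor dst{"dst"};
    // Post-format style: a space between 'if' and '(', and braced initializers
    // without inner padding, e.g. {{ACL_SRC, &src}, {ACL_DST, &dst}}.
    Pack pack{{ACL_SRC, &src}, {ACL_DST, &dst}};
    if (pack.get(ACL_DST) != nullptr)
    {
        std::printf("dst tensor: %s\n", pack.get(ACL_DST)->name);
    }
    return 0;
}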
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
index d975859d87..0621238ab5 100644
--- a/src/gpu/cl/operators/ClFullyConnected.h
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -47,7 +47,7 @@ namespace kernels
{
class ClMatMulNativeKernel;
class ClMatMulLowpNativeKernel;
-}
+} // namespace kernels
/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
*
* -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer)
@@ -88,7 +88,11 @@ public:
* Data type supported: Same as @p src.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -96,18 +100,36 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
// Inherited methods overridden
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
- void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
- void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
- void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
+ void configure_fc_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info);
+ void configure_conv_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info);
+ void configure_mm(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info);
private:
enum AuxTensorIdx
@@ -134,19 +156,19 @@ private:
TensorInfo _reshaped_weights{};
TensorInfo _lhs_to_use{};
TensorInfo _weights_to_use{};
- int _weights_to_use_idx{ ACL_SRC_1 };
+ int _weights_to_use_idx{ACL_SRC_1};
- bool _run_convert_weights{ false };
- bool _transpose_weights{ false };
- bool _dynamic_gemm{ false };
- bool _use_matmul{ false };
+ bool _run_convert_weights{false};
+ bool _transpose_weights{false};
+ bool _dynamic_gemm{false};
+ bool _use_matmul{false};
- bool _is_fc_after_conv{ true };
- bool _is_quantized{ false };
- bool _is_prepared{ false };
+ bool _is_fc_after_conv{true};
+ bool _is_quantized{false};
+ bool _is_prepared{false};
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
- int _asrt_run_count {};
+ int _asrt_run_count{};
int _asrt_prepare_count{};
#endif // ARM_COMPUTE_ASSERTS_ENABLED
};
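Not part of the commit, but for illustration: the header above, like ClGemm.h further down, pairs every configure() with a static validate() that takes the same tensor metadata, so callers can reject an unsupported configuration without constructing the operator. A minimal sketch of that split, using illustrative stand-ins rather than arm_compute types, could look like this.

// Standalone sketch (not ACL code): Status, TensorInfo and Operator are stand-ins.
#include <cstdio>
#include <string>

struct Status
{
    bool        ok;
    std::string error;
};

struct TensorInfo
{
    int num_dimensions;
};

class Operator
{
public:
    // validate() is static so callers can reject unsupported shapes up front,
    // before any kernel state exists; configure() re-checks, then would build kernels.
    static Status validate(const TensorInfo *src, const TensorInfo *dst)
    {
        if (src == nullptr || dst == nullptr)
        {
            return Status{false, "null tensor info"};
        }
        if (src->num_dimensions != dst->num_dimensions)
        {
            return Status{false, "rank mismatch"};
        }
        return Status{true, ""};
    }

    void configure(const TensorInfo *src, const TensorInfo *dst)
    {
        const Status st = validate(src, dst);
        if (!st.ok)
        {
            std::printf("configure rejected: %s\n", st.error.c_str());
            return;
        }
        _configured = true;
    }

private:
    bool _configured = false;
};

int main()
{
    const TensorInfo a{2};
    const TensorInfo b{2};
    Operator         op;
    if (Operator::validate(&a, &b).ok)
    {
        op.configure(&a, &b);
    }
    return 0;
}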
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 7e331a86f3..815c254c69 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -33,11 +33,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/ITensorAllocator.h"
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
@@ -45,8 +46,6 @@
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-
-#include "src/common/utils/Log.h"
#include "support/Cast.h"
#include "utils/TypePrinter.h"
@@ -67,35 +66,43 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
return kernel_type == CLGEMMKernelType::NATIVE ? false : true;
}
//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
-inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
+inline CLGEMMKernelType
+auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
{
- if(!constant_weights)
+ if (!constant_weights)
{
return CLGEMMKernelType::NATIVE;
}
auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
- if(bool(gemm_kernel))
+ if (bool(gemm_kernel))
{
- if(validate_gemm_kernel(gemm_kernel.gemm_type))
+ if (validate_gemm_kernel(gemm_kernel.gemm_type))
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
return gemm_kernel.gemm_type;
}
}
gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
return gemm_kernel.gemm_type;
}
// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ GEMMKernelInfo gemm_kernel_info)
{
// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
TensorInfo tmp_b_info{};
// Validate reshape RHS kernel
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
{
return false;
}
@@ -103,12 +110,14 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
gemm_kernel_info.lhs_info = lhs_info;
gemm_kernel_info.rhs_info = rhs_info;
gemm_kernel_info.has_pad_y = false;
- if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+ if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+ rhs_info, gemm_kernel_info)))
{
return false;
}
gemm_kernel_info.has_pad_y = true;
- if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+ if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+ rhs_info, gemm_kernel_info)))
{
return false;
}
@@ -116,49 +125,65 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
}
//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,
- const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output)
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+ GEMMKernelInfo kernel_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output)
{
auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
- if(config)
+ if (config)
{
- if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
+ if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
}
config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
// Validate lhs_info and rhs_info for reshaped kernel
-inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)
+inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ GEMMKernelInfo gemm_kernel_info,
+ bool reinterpret_input_as_3d)
{
// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
// Validate reshape LHS kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
- if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
+ auto_init_if_empty(tmp_a_info,
+ a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
+ if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
{
return false;
}
// Validate reshape RHS kernel
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
{
return false;
}
// Validate mm kernel
gemm_kernel_info.lhs_info = lhs_info;
gemm_kernel_info.rhs_info = rhs_info;
- if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+ if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+ rhs_info, gemm_kernel_info)))
{
return false;
}
@@ -166,21 +191,32 @@ inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, co
}
//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query,
+ GEMMKernelInfo kernel_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ bool reinterpret_input_as_3d)
{
auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
- if(config)
+ if (config)
{
- if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
+ if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info,
+ reinterpret_input_as_3d))
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
}
config = auto_heuristics::select_default_gemm_config_reshaped(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+ to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
} // namespace
@@ -200,18 +236,24 @@ ClGemm::ClGemm()
{
}
-void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
+void ClGemm::configure_native(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -225,24 +267,32 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn
// Set the target for the kernels
_mm_native_kernel->set_target(gpu_target);
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
// Configure and tune matrix multiply kernel
- _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info);
+ _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info,
+ kernel_info);
}
-void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
+void ClGemm::configure_reshaped(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -261,32 +311,42 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor
GEMMRHSMatrixInfo rhs_info{};
// Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,
- c, output, gemm_info.reinterpret_input_as_3d());
+ std::tie(lhs_info, rhs_info) =
+ auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size},
+ kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d());
_reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
// Configure and tune matrix multiply kernel
- _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+ kernel_info);
// Request memory for LHS and RHS reshape matrix
_aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
}
-void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
+void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -304,7 +364,8 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
GEMMRHSMatrixInfo rhs_info{};
// Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output);
// Transpose matrix
_reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
@@ -315,24 +376,33 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context
// Configure matrix multiply kernel with no y padding support
kernel_info.has_pad_y = false;
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+ kernel_info);
// Request memory for RHS reshape matrix
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
}
-void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
- const GEMMInfo &gemm_info)
+void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const GPUTarget gpu_target = CLScheduler::get().target();
- bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -350,9 +420,10 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co
GEMMRHSMatrixInfo rhs_info{};
// Pick up the GEMM configuration
- auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
+ auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
// Force H0 to 4 in order to use the MMUL extension
rhs_info.h0 = 4;
@@ -361,13 +432,22 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co
// Configure matrix multiply kernel with no y padding support
kernel_info.has_pad_y = false;
- _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info,
+ rhs_info, kernel_info);
// Request memory for RHS reshape matrix
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
}
-Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_native(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
@@ -376,12 +456,12 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const
const GPUTarget gpu_target = CLScheduler::get().target();
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -392,15 +472,23 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const
kernel_info.broadcast_bias = broadcast_bias;
kernel_info.activation_info = gemm_info.activation_info();
- auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(
+ a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info));
return Status{};
}
-Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_reshaped(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
@@ -412,12 +500,12 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con
const GPUTarget gpu_target = CLScheduler::get().target();
DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -433,23 +521,33 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con
// Pick up the GEMM configuration
// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
+ const auto gemm_config =
+ select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(
+ compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha,
+ beta, lhs_info, rhs_info, kernel_info));
return Status{};
}
-Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
@@ -460,12 +558,12 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
const GPUTarget gpu_target = CLScheduler::get().target();
const DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -481,24 +579,33 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf
// Pick up the GEMM configuration
// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
+ const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
// Validate matrix multiply
kernel_info.has_pad_y = false;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
kernel_info.has_pad_y = true;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
return Status{};
}
-Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
ARM_COMPUTE_UNUSED(output);
@@ -508,12 +615,12 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens
const GPUTarget gpu_target = CLScheduler::get().target();
const DataType data_type = a->data_type();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool broadcast_bias = gemm_info.broadcast_bias();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
GEMMKernelInfo kernel_info;
kernel_info.m = m;
@@ -529,9 +636,10 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens
// Pick up the GEMM configuration
// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
- const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
- lhs_info = gemm_config.lhs_info;
- rhs_info = gemm_config.rhs_info;
+ const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
// Force H0 to 4 in order to use the MMUL extension
rhs_info.h0 = 4;
@@ -540,12 +648,20 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens
// Validate matrix multiply
kernel_info.has_pad_y = false;
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(
+ a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
return Status{};
}
-void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void ClGemm::configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
@@ -558,20 +674,21 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a,
_is_prepared = gemm_info.retain_internal_weights();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
// Select GEMMType
- _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,
- b->are_values_constant());
+ _gemm_kernel_type = auto_select_gemm_kernel(
+ auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size},
+ _reshape_b_only_on_first_run, b->are_values_constant());
const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
- switch(_gemm_kernel_type)
+ switch (_gemm_kernel_type)
{
case CLGEMMKernelType::NATIVE:
{
@@ -600,35 +717,41 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a,
}
}
-Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
// Get the GPU target
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
// Check data type early because the auto_select_gemm_kernel has assertions on supported data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
// Select GEMMType
- CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
- {
- CLScheduler::get().target(),
- a->data_type(),
- m,
- n,
- k,
- batch_size,
- },
- gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
+ CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(
+ auto_heuristics::CommonQuery{
+ CLScheduler::get().target(),
+ a->data_type(),
+ m,
+ n,
+ k,
+ batch_size,
+ },
+ gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
- switch(gemm_kernel_type)
+ switch (gemm_kernel_type)
{
case CLGEMMKernelType::NATIVE:
{
@@ -647,7 +770,8 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
}
case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info));
break;
}
default:
@@ -674,7 +798,7 @@ void ClGemm::run(ITensorPack &tensors)
prepare(tensors);
// Run matrix multiply kernel
- switch(_gemm_kernel_type)
+ switch (_gemm_kernel_type)
{
case CLGEMMKernelType::NATIVE:
{
@@ -684,13 +808,13 @@ void ClGemm::run(ITensorPack &tensors)
case CLGEMMKernelType::RESHAPED:
{
// Run interleave kernel
- ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
+ ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}};
CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+ ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
}
// Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts
@@ -698,7 +822,7 @@ void ClGemm::run(ITensorPack &tensors)
gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get());
gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
{
CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
}
@@ -706,10 +830,10 @@ void ClGemm::run(ITensorPack &tensors)
}
case CLGEMMKernelType::RESHAPED_ONLY_RHS:
{
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+ ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
}
// In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
@@ -722,7 +846,7 @@ void ClGemm::run(ITensorPack &tensors)
ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
- if(has_pad_y)
+ if (has_pad_y)
{
ARM_COMPUTE_ERROR_ON(has_pad_y);
}
@@ -734,10 +858,10 @@ void ClGemm::run(ITensorPack &tensors)
}
case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
{
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+ ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
}
// In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
@@ -750,7 +874,7 @@ void ClGemm::run(ITensorPack &tensors)
ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
- if(has_pad_y)
+ if (has_pad_y)
{
ARM_COMPUTE_ERROR_ON(has_pad_y);
}
@@ -769,20 +893,22 @@ void ClGemm::run(ITensorPack &tensors)
void ClGemm::prepare(ITensorPack &constants)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
- ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
+ const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
+ ICLTensor *rhs_aux =
+ utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
// If memory for RHS is persistent and src1 is provided, re-transform; else assume that RHS is already transformed
- if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
+ if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) &&
+ (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
{
ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
- ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
+ ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}};
CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
}
_is_prepared = true;
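As an aside to the hunks above (this sketch is not part of the patch): auto_select_gemm_kernel() and the auto_select_gemm_config_* helpers all follow the same shape — query the mlgo heuristic first, validate its result against the kernels, and fall back to the default heuristic otherwise. The standalone snippet below captures only that control flow; GemmConfig, query_primary_heuristic and the other names are illustrative, not library API.

// Standalone sketch (not ACL code): select-validate-fallback kernel selection.
#include <cstdio>

enum class KernelType
{
    NATIVE,
    RESHAPED,
    RESHAPED_ONLY_RHS
};

struct GemmConfig
{
    bool       found;
    KernelType type;
};

// Primary (e.g. tuned/mlgo-style) heuristic: may not return a usable config.
GemmConfig query_primary_heuristic(bool constant_weights)
{
    if (!constant_weights)
    {
        return GemmConfig{false, KernelType::NATIVE};
    }
    return GemmConfig{true, KernelType::RESHAPED_ONLY_RHS};
}

// Stand-in for the kernel validate() calls guarding the heuristic result.
bool config_is_supported(const GemmConfig &config)
{
    return config.type != KernelType::NATIVE;
}

GemmConfig query_default_heuristic()
{
    return GemmConfig{true, KernelType::RESHAPED};
}

KernelType select_kernel(bool constant_weights)
{
    const GemmConfig primary = query_primary_heuristic(constant_weights);
    if (primary.found && config_is_supported(primary))
    {
        std::printf("using primary heuristic\n");
        return primary.type;
    }
    // Fall back to the default heuristic, which is always accepted.
    std::printf("falling back to default heuristic\n");
    return query_default_heuristic().type;
}

int main()
{
    select_kernel(true);
    select_kernel(false);
    return 0;
}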
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
index 11f9f2b3d8..85dc1d6c8f 100644
--- a/src/gpu/cl/operators/ClGemm.h
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -90,30 +90,95 @@ public:
* if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
* in case matrix A and matrix B have already been transformed.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClGemm::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
- void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ void configure_native(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ void configure_reshaped(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ void configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
- static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+ static Status validate_native(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ static Status validate_reshaped(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ static Status validate_reshaped_only_rhs(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
private:
enum AuxTensorIdx
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
index 5620471ff9..55d815a1ef 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.cpp
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -28,10 +28,12 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClActivationKernel.h"
@@ -41,8 +43,6 @@
#include "src/gpu/cl/operators/ClGemm.h"
#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "src/common/utils/Log.h"
#include "support/Cast.h"
namespace arm_compute
@@ -53,18 +53,38 @@ using namespace utils::cast;
namespace opencl
{
ClGemmConv2d::ClGemmConv2d()
- : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
- _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+ : _weights_reshape_kernel(nullptr),
+ _im2col_kernel(nullptr),
+ _mm_gemm(nullptr),
+ _mm_gemmlowp(nullptr),
+ _col2im_kernel(nullptr),
+ _activation_kernel(nullptr),
+ _im2col_output(),
+ _weights_reshaped(),
+ _gemm_output(),
+ _skip_im2col(false),
+ _skip_col2im(false),
+ _is_quantized(false),
+ _fuse_activation(true),
+ _append_bias(false),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
{
}
ClGemmConv2d::~ClGemmConv2d() = default;
-void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info)
+ int gemm_3d_depth,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
false, // is_b_reshaped
@@ -77,18 +97,20 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
false, // fp_mixed_precision
true, // broadcast_bias
act_info // activation_info
- );
+ );
- TensorInfo tmp_src{ *src };
- if(_is_quantized)
+ TensorInfo tmp_src{*src};
+ if (_is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo input_quantization_info = src->quantization_info();
const QuantizationInfo weights_quantization_info = weights->quantization_info();
- tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+ tmp_src.set_quantization_info(
+ QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->set_quantization_info(
+ QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
_mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
_mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
@@ -97,7 +119,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
weights->set_quantization_info(weights_quantization_info);
auto mm_mem_req = _mm_gemmlowp->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
@@ -108,15 +130,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I
_mm_gemm = std::make_unique<ClGemm>();
_mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
auto mm_mem_req = _mm_gemm->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
}
}
-Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
+Status ClGemmConv2d::validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ const ActivationLayerInfo &act_info)
{
const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
@@ -131,9 +159,9 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig
false, // fp_mixed_precision
true, // broadcast_bias
act_info // activation_info
- );
+ );
- if(is_quantized)
+ if (is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -142,8 +170,10 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig
std::unique_ptr<ITensorInfo> src_qa = src->clone();
std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+ src_qa->set_quantization_info(
+ QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights_qa->set_quantization_info(
+ QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
// Perform validation step on GEMMLowp
return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
@@ -155,14 +185,17 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig
}
}
-void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info)
+void ClGemmConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst,
- conv2d_info,
- weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
const DataType data_type = src->data_type();
@@ -180,7 +213,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
_is_prepared = weights_info.retain_internal_weights();
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
_skip_col2im = data_layout == DataLayout::NHWC;
    // Only for quantized types are there a few cases where we cannot fuse the activation function in GEMM
@@ -197,12 +231,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv2d_info.conv_info,
- conv2d_info.dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
@@ -210,28 +240,31 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
_append_bias = false;
_weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
- if(conv2d_info.num_groups != 1 && biases != nullptr)
+ if (conv2d_info.num_groups != 1 && biases != nullptr)
{
// num_groups != 1 can only be for NCHW
        // Since a utility function to reshape the biases is missing, we append the biases into the weights tensor
biases_to_use = nullptr;
_append_bias = true;
- _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups);
+ _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped,
+ conv2d_info.num_groups);
}
else
{
- _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups);
+ _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped,
+ conv2d_info.num_groups);
}
// Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
// Configure and tune im2col. im2col output shape is auto-initialized
_im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>();
// Set the GPU target for im2col
_im2col_kernel->set_target(CLScheduler::get().target());
- _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
+ _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height),
+ conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
// Set quantization info
_im2col_output.set_quantization_info(src->quantization_info());
@@ -242,7 +275,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
}
// Create GEMM output tensor
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
TensorShape shape_gemm;
@@ -263,7 +296,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
gemmlowp_output_stage.gemmlowp_offset = 0;
// Configure output stage for quantized case
- if(_is_quantized)
+ if (_is_quantized)
{
const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
@@ -286,16 +319,16 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
auto min_activation = min_val.get<int32_t>();
auto max_activation = max_val.get<int32_t>();
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
- if(conv2d_info.act_info.enabled())
+ if (conv2d_info.act_info.enabled())
{
- if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
+ if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
{
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
}
else
{
@@ -313,48 +346,60 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
+ configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use,
+ gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
// Set the GPU target for col2im
_col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
_col2im_kernel->set_target(CLScheduler::get().target());
// Configure and tune Col2Im
- _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups);
+ _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h),
+ conv2d_info.num_groups);
CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
}
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
- if(!_fuse_activation)
+ if (!_fuse_activation)
{
_activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
_activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
}
- _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
- _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+ _aux_mem[Im2ColOutput] =
+ MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] =
+ MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
}
-Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+Status ClGemmConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
- if(!is_quantized_per_channel)
+ if (!is_quantized_per_channel)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
- ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8),
+ "Grouping (num_groups != 1) is not supported with QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) &&
+ (src->data_layout() == DataLayout::NCHW));
const DataLayout data_layout = src->data_layout();
const DataType data_type = src->data_type();
@@ -374,18 +419,19 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
const ITensorInfo *gemm_output_to_use = dst;
const ITensorInfo *weights_to_use = weights;
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
- && conv2d_info.conv_info.stride().second == 1);
- const bool skip_col2im = data_layout == DataLayout::NHWC;
- bool fuse_activation = true;
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) !=
+ src->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -397,7 +443,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
}
- if(conv2d_info.act_info.enabled())
+ if (conv2d_info.act_info.enabled())
{
ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
}
@@ -406,48 +452,50 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv2d_info.conv_info,
- conv2d_info.dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
const ITensorInfo *biases_to_use = biases;
bool append_bias = false;
- if(conv2d_info.num_groups != 1 && biases != nullptr)
+ if (conv2d_info.num_groups != 1 && biases != nullptr)
{
// num_groups != 1 can only be for NCHW
        // Since a utility function to reshape the biases is missing, we append the biases into the weights tensor
- biases_to_use = nullptr;
- append_bias = true;
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
+ biases_to_use = nullptr;
+ append_bias = true;
+ weights_reshaped_info =
+ TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
}
else
{
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
+ weights_reshaped_info =
+ TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
}
weights_to_use = &weights_reshaped_info;
- if(!skip_im2col)
+ if (!skip_im2col)
{
const Size2D kernel_dims(kernel_width, kernel_height);
// Output tensor auto initialization if not yet initialized
- TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups);
+ TensorShape expected_output_shape =
+ compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation,
+ conv2d_info.num_groups == 1, conv2d_info.num_groups);
auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info,
+ append_bias, conv2d_info.dilation, conv2d_info.num_groups));
gemm_input_to_use = &im2col_reshaped_info;
}
// Create GEMM output tensor
- if(!skip_col2im)
+ if (!skip_col2im)
{
TensorShape shape_gemm;
@@ -465,7 +513,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
gemmlowp_output_stage.gemmlowp_offset = 0;
gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
- if(is_quantized)
+ if (is_quantized)
{
const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -483,16 +531,16 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
int min_activation = 0;
int max_activation = 0;
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
- if(conv2d_info.act_info.enabled())
+ if (conv2d_info.act_info.enabled())
{
- if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
+ if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
{
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
}
else
{
@@ -509,16 +557,18 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights
// In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use,
+ gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
// Validate Col2Im
- if(!skip_col2im)
+ if (!skip_col2im)
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
}
// Validate Activation Layer
- if(!fuse_activation)
+ if (!fuse_activation)
{
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
}
@@ -541,30 +591,26 @@ void ClGemmConv2d::run(ITensorPack &tensors)
CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
// Run im2col
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, im2col_output.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false);
gemm_input_to_use = im2col_output.get();
}
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
gemm_output_to_use = gemm_output.get();
}
ITensorPack pack_mm = tensors;
pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
- if(!_append_bias)
+ if (!_append_bias)
{
pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases);
}
pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
// Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions
- if(_is_quantized)
+ if (_is_quantized)
{
// Run gemmlowp
_mm_gemmlowp->run(pack_mm);
@@ -576,43 +622,32 @@ void ClGemmConv2d::run(ITensorPack &tensors)
}
// Reshape output matrix
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false);
}
    // Run Activation Layer if we cannot fuse it in GEMM
- if(!_fuse_activation)
+ if (!_fuse_activation)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, dst },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false);
}
}
void ClGemmConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Run weights reshaping and mark original weights tensor as unused
- ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+ ICLTensor *weights_reshaped_p =
+ utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, weights_reshaped.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
- if(_append_bias)
+ if (_append_bias)
{
const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
pack.add_const_tensor(TensorType::ACL_BIAS, biases);
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
index 8a46ee2dc3..e8f3147ac3 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.h
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -100,15 +101,24 @@ public:
* @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
* tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClGemmConvolution::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &conv2d_info,
const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
@@ -130,9 +140,14 @@ private:
* @param[in] gemm_3d_depth Depth of GEMM 3D
* @param[in] act_info Activation to apply after the matrix multiplication
*/
- void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+ void configure_mm(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info);
+ int gemm_3d_depth,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -148,8 +163,14 @@ private:
*
* @return a status
*/
- static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
+ static Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ const ActivationLayerInfo &act_info);
enum AuxTensorIdx
{
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index 2622274587..71c247de79 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -52,7 +52,7 @@ namespace
{
inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
{
- switch(kernel_type)
+ switch (kernel_type)
{
case CLGEMMKernelType::NATIVE:
case CLGEMMKernelType::RESHAPED_ONLY_RHS:
@@ -71,32 +71,41 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
{
auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
- if(bool(gemm_kernel))
+ if (bool(gemm_kernel))
{
- if(validate_gemm_kernel(gemm_kernel.gemm_type))
+ if (validate_gemm_kernel(gemm_kernel.gemm_type))
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
return gemm_kernel.gemm_type;
}
}
gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
return gemm_kernel.gemm_type;
}
// Validate lhs_info and rhs_info for native kernel
-inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
+inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const GEMMReshapeInfo &reshape_info)
{
// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
TensorInfo mm_result_s32_info{};
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
+ auto_init_if_empty(
+ mm_result_s32_info,
+ a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
// Validate mm kernel
// NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info
// NOTE: This assumes:
    // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
// 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
- if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
+ if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info,
+ reshape_info)))
{
return false;
}
@@ -104,31 +113,45 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons
}
// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const GEMMReshapeInfo &reshape_info)
{
auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
- if(config)
+ if (config)
{
- if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
+ if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
}
config = auto_heuristics::select_default_gemm_config_native(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
- unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d)
{
// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
TensorInfo tmp_b_info{};
// Validate reshape RHS kernel
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
{
return false;
}
@@ -148,7 +171,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
// Since we ignore the output stage, output data type has to be S32 to pass the validation
TensorInfo output_info_copy(*output);
output_info_copy.set_data_type(DataType::S32);
- if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+ if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy,
+ gemm_kernel_info)))
{
return false;
}
@@ -156,14 +180,22 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs
}
// Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
- unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d)
{
// Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
TensorInfo tmp_b_info{};
// Validate reshape RHS kernel
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
- if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
{
return false;
}
@@ -183,7 +215,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo
// Since we ignore the output stage, output data type has to be S32 to pass the validation
TensorInfo output_info_copy(*output);
output_info_copy.set_data_type(DataType::S32);
- if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+ if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy,
+ gemm_kernel_info)))
{
return false;
}
@@ -191,40 +224,55 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo
}
// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
- const ITensorInfo *a,
- const ITensorInfo *b, const ITensorInfo *output)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output)
{
auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
- if(config)
+ if (config)
{
- if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
+ if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+ query.k, reinterpret_input_as_3d, depth_output_gemm3d))
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
}
config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
- const ITensorInfo *a,
- const ITensorInfo *b, const ITensorInfo *output)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
- validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
- to_string(config.rhs_info).c_str());
- return { config.lhs_info, config.rhs_info };
+ validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+ query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
}
inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
{
- switch(kernel_type)
+ switch (kernel_type)
{
case CLGEMMKernelType::NATIVE:
return false;
@@ -254,8 +302,11 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;
void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
- ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,
- const GEMMInfo &gemm_info)
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
@@ -263,8 +314,8 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_a_offset = a->quantization_info().uniform().offset;
- _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
- && a->data_type() == DataType::QASYMM8;
+ _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) &&
+ is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8;
_b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
_gemm_info = gemm_info;
@@ -282,17 +333,18 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
// Arguments used by GEMMReshapeInfo
// in order to know how the matrices have been reshaped
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
- _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);
+ _gemm_kernel_type = auto_select_gemm_kernel(
+ auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run);
- if(_convert_to_qasymm8)
+ if (_convert_to_qasymm8)
{
// Set data type for converted weights
_qasymm8_weights = *b;
@@ -301,47 +353,50 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
}
ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
matrix_b = &_tmp_b;
// Pick up the GEMM configuration
        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
- depth_output_gemm3d,
- a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+ depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
// Configure reshape RHS kernel
- _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+ _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+ rhs_info);
}
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
{
matrix_b = &_tmp_b;
// Pick up the GEMM configuration
        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
- depth_output_gemm3d,
- a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+ depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
// Configure reshape RHS kernel
- _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+ _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+ rhs_info);
}
// Using default reduction info
- const GEMMLowpReductionKernelInfo reduction_info {};
+ const GEMMLowpReductionKernelInfo reduction_info{};
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ if (_a_offset != 0)
{
_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
+ _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b,
+ &_vector_sum_col, reduction_info);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
@@ -360,17 +415,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
gemm_kernel_info.a_offset = _a_offset;
gemm_kernel_info.b_offset = _b_offset;
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
// Configure offset contribution kernel
- const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+ ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+ : 1;
_gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
_gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
gemmlowp_output_stage.output_data_type = a->data_type();
- if(num_filters == 1)
+ if (num_filters == 1)
{
// Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
@@ -379,55 +436,67 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
gemm_kernel_info.output_stage = gemmlowp_output_stage;
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS &&
+ gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
// Configure and tune matrix multiply kernel with fused output stage
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ _mm_reshaped_only_rhs_kernel->configure(
+ compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+ &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
}
- else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL &&
+ gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
// Configure and tune matrix multiply kernel with fused output stage
- _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ _mm_reshaped_only_rhs_mmul_kernel->configure(
+ compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+ &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
}
else
{
_run_output_stage = true;
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
- _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+ gemm_kernel_info);
}
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
{
- _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+ gemm_kernel_info);
}
else
{
// Pick up the GEMM configuration
                // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
- a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+ _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
// Configure matrix multiply kernel
- _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
-
- _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
- c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
- &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info,
+ reshape_info);
+
+ _offset_contribution_output_stage_kernel->configure(
+ compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0),
+ _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers,
+ &_gemm_output_stage_shifts);
}
}
}
else
{
_run_offset_contribution = true;
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
// Configure and tune matrix multiply kernel
_mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
}
- else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
{
// Configure and tune matrix multiply kernel
_mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
@@ -436,44 +505,65 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
{
// Pick up the GEMM configuration
                // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
- a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+ _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
// Configure matrix multiply kernel
_mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
}
// Configure offset contribution kernel
- _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
- c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+ a->dimension(0), _a_offset, _b_offset);
}
// Request memory
- _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
- if(is_gemm_reshaped(_gemm_kernel_type))
+ _aux_mem[RhsQAsymm8] =
+ MemoryInfo(offset_int_vec(RhsQAsymm8),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _qasymm8_weights.total_size());
+ if (is_gemm_reshaped(_gemm_kernel_type))
{
// Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
- _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
- _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
- }
- if(_a_offset != 0)
- {
- _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
- }
- if(_b_offset != 0)
- {
- _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
- }
- _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
- _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
- _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
+ _aux_mem[RhsQAsymm8] =
+ MemoryInfo(offset_int_vec(RhsQAsymm8),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary,
+ _qasymm8_weights.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ }
+ if (_a_offset != 0)
+ {
+ _aux_mem[VecSumCol] =
+ MemoryInfo(offset_int_vec(VecSumCol),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ }
+ if (_b_offset != 0)
+ {
+ _aux_mem[VecSumRow] =
+ MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ }
+ _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent,
+ _gemm_output_stage_multipliers.total_size());
+ _aux_mem[Shifts] =
+ MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
}
-Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
@@ -492,39 +582,44 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
const GPUTarget gpu_target = CLScheduler::get().target();
bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));
+ bool reshape_matrix_b = is_gemm_reshaped(
+ auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size},
+ gemm_info.reshape_b_only_on_first_run()));
const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
- bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
- && is_data_type_quantized_asymmetric(a->data_type());
+ bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) &&
+ is_data_type_quantized_symmetric(b->data_type()) &&
+ is_data_type_quantized_asymmetric(a->data_type());
TensorInfo weights_info(*b);
- if(convert_to_qasymm8)
+ if (convert_to_qasymm8)
{
b_offset = -128;
weights_info.set_data_type(DataType::QASYMM8);
ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
}
const ITensorInfo *matrix_b_info = &weights_info;
- if(reshape_matrix_b)
+ if (reshape_matrix_b)
{
matrix_b_info = &tmp_b_info;
// Pick up the GEMM configuration
// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
+ const auto res = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
// Validate reshape RHS kernel
- auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+ auto_init_if_empty(tmp_b_info,
+ weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
}
@@ -533,21 +628,23 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
const GEMMLowpReductionKernelInfo reduction_info;
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
+ if (a_offset != 0)
{
info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
// Validate Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
}
// Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
+ if (b_offset != 0)
{
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Validate matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
}
GEMMKernelInfo gemm_kernel_info;
@@ -560,92 +657,99 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
gemm_kernel_info.rhs_info = rhs_info;
gemm_kernel_info.a_offset = a_offset;
gemm_kernel_info.b_offset = b_offset;
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
- const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+ ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+ : 1;
- const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+ const TensorInfo gemm_output_stage_multipliers_shifts_info(
+ TensorInfo(TensorShape(num_filters), 1, DataType::S32));
GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
gemmlowp_output_stage.output_data_type = a->data_type();
gemm_kernel_info.output_stage = gemmlowp_output_stage;
- if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (reshape_matrix_b &&
+ gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
}
else
{
TensorInfo mm_result_s32_info{};
- if(reshape_matrix_b)
+ if (reshape_matrix_b)
{
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
+ auto_init_if_empty(mm_result_s32_info, a->clone()
+ ->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, reshape_info))
+ .set_data_type(DataType::S32));
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
}
else
{
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+ auto_init_if_empty(mm_result_s32_info, a->clone()
+ ->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, false, reshape_info))
+ .set_data_type(DataType::S32));
// Pick up the GEMM configuration
// NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
+ const auto res = select_default_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- output,
- a_offset, b_offset,
- gemmlowp_output_stage,
- &gemm_output_stage_multipliers_shifts_info,
- &gemm_output_stage_multipliers_shifts_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage,
+ &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info));
}
}
else
{
- if(reshape_matrix_b)
+ if (reshape_matrix_b)
{
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ matrix_a_info, matrix_b_info, output, gemm_kernel_info));
}
else
{
// Pick up the GEMM configuration
// It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
- const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
- lhs_info = res.lhs_info;
- rhs_info = res.rhs_info;
+ const auto res = select_default_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
// Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+ matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
}
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c, a_offset, b_offset));
}
}
@@ -675,73 +779,61 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
const ITensor *matrix_a = a;
const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
- if(is_gemm_reshaped(_gemm_kernel_type))
+ if (is_gemm_reshaped(_gemm_kernel_type))
{
matrix_b = tmp_b.get();
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run reshape matrix B
- ITensorPack mtx_b_reshape_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
+ ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, tmp_b.get()}};
CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
}
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- ITensorPack mtx_b_red_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, vec_sum_col.get() }
- };
+ ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, vec_sum_col.get()}};
CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
}
// Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
- ITensorPack mtx_a_red_pack =
- {
- { TensorType::ACL_SRC, matrix_a },
- { TensorType::ACL_DST, vec_sum_row.get() }
- };
+ ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}};
CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
}
// Run matrix multiply
- if(is_gemm_reshaped(_gemm_kernel_type))
+ if (is_gemm_reshaped(_gemm_kernel_type))
{
ITensorPack gemm_reshaped_pack;
- if(_run_offset_contribution)
+ if (_run_offset_contribution)
{
- gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b },
- { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
- });
+ gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a},
+ {TensorType::ACL_SRC_1, matrix_b},
+ {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}});
}
else
{
- gemm_reshaped_pack = ITensorPack(
- {
- { TensorType::ACL_SRC, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b },
- { TensorType::ACL_BIAS, c },
- { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
- { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
- { TensorType::ACL_SHIFTS, shifts.get() },
- { TensorType::ACL_MULTIPLIERS, multipliers.get() },
- { TensorType::ACL_DST, dst },
+ gemm_reshaped_pack = ITensorPack({
+ {TensorType::ACL_SRC, matrix_a},
+ {TensorType::ACL_SRC_1, matrix_b},
+ {TensorType::ACL_BIAS, c},
+ {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+ {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+ {TensorType::ACL_SHIFTS, shifts.get()},
+ {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+ {TensorType::ACL_DST, dst},
});
}
- if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
{
CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
}
- else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
{
CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
}
@@ -752,46 +844,39 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
}
else
{
- ITensorPack gemm_native_pack =
- {
- { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b },
- { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
- };
+ ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a},
+ {TensorType::ACL_SRC_1, matrix_b},
+ {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}};
CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
}
- if(_run_output_stage)
+ if (_run_output_stage)
{
// Run offset contribution/output stage kernel
- ITensorPack output_stage_pack =
- {
- { TensorType::ACL_SRC, res32.get() },
- { TensorType::ACL_BIAS, c },
- { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
- { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
- { TensorType::ACL_SHIFTS, shifts.get() },
- { TensorType::ACL_MULTIPLIERS, multipliers.get() },
- { TensorType::ACL_DST, dst },
+ ITensorPack output_stage_pack = {
+ {TensorType::ACL_SRC, res32.get()},
+ {TensorType::ACL_BIAS, c},
+ {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+ {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+ {TensorType::ACL_SHIFTS, shifts.get()},
+ {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+ {TensorType::ACL_DST, dst},
};
CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
}
- if(_run_offset_contribution)
+ if (_run_offset_contribution)
{
// Run offset contribution kernel
- ITensorPack offset_contrib_pack =
- {
- { TensorType::ACL_SRC_DST, dst },
- { TensorType::ACL_BIAS, c },
- { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
- { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
- };
+ ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst},
+ {TensorType::ACL_BIAS, c},
+ {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+ {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}};
CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
}
}
void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
@@ -800,56 +885,55 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
ARM_COMPUTE_ERROR_ON_NULLPTR(b);
- if(_convert_to_qasymm8)
+ if (_convert_to_qasymm8)
{
- ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
+ ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}};
CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
b->mark_as_unused();
}
- if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
+ if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
{
// Run reshape kernel and mark original weights tensor as unused
- ITensorPack mtx_b_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
+ ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, tmp_b.get()}};
CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
b->mark_as_unused();
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && _reshape_b_only_on_first_run)
+ if (_a_offset != 0 && _reshape_b_only_on_first_run)
{
- ITensorPack mtx_b_red_pack =
- {
- { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
- { TensorType::ACL_DST, vec_sum_col.get() }
- };
+ ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, vec_sum_col.get()}};
CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
}
// Compute GEMM output multipliers and shifts for output stage
{
- const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+ const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+ ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+ : 1;
CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);
ICLTensor *multiplier_tensor = multipliers.get();
- if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
+ if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
{
multiplier_tensor->map(CLScheduler::get().queue(), true);
- std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
+ std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)),
+ _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(),
+ num_filters * sizeof(int32_t));
multiplier_tensor->unmap(CLScheduler::get().queue());
}
ICLTensor *shifts_tensor = shifts.get();
- if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
+ if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
{
shifts_tensor->map(CLScheduler::get().queue(), true);
- std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+ std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)),
+ _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
shifts_tensor->unmap(CLScheduler::get().queue());
}
}
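The end of prepare() above stages the per-channel output-stage multipliers and shifts by mapping each CL tensor, copying num_filters 32-bit values, and unmapping it again. A hedged sketch of that pattern as a free function (copy_int32_to_cl_tensor is a hypothetical helper name; the member calls mirror the ones used in prepare()):

#include <cstddef>
#include <cstdint>
#include <cstring>

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

// Hypothetical helper illustrating the map/memcpy/unmap staging used in prepare().
void copy_int32_to_cl_tensor(arm_compute::ICLTensor *tensor, const int32_t *values, size_t count)
{
    if (tensor == nullptr || tensor->info()->total_size() == 0)
    {
        return; // nothing to stage
    }
    auto &queue = arm_compute::CLScheduler::get().queue();
    tensor->map(queue, true); // blocking map so the host-side pointer is valid
    std::memcpy(tensor->ptr_to_element(arm_compute::Coordinates(0)), values, count * sizeof(int32_t));
    tensor->unmap(queue);
}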
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
index 6e32a90fc4..c80dc3a182 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
@@ -93,18 +93,27 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClGemmLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -130,7 +139,7 @@ private:
std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
- std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+ std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
// Temporary tensors
TensorInfo _qasymm8_weights{};
@@ -141,13 +150,13 @@ private:
TensorInfo _gemm_output_stage_multipliers{};
TensorInfo _gemm_output_stage_shifts{};
- int32_t _a_offset{ 0 };
- int32_t _b_offset{ 0 };
- bool _reshape_b_only_on_first_run{ false };
- bool _run_output_stage{ false };
- bool _convert_to_qasymm8{ false };
- bool _run_offset_contribution{ false };
- bool _is_prepared{ false };
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ bool _reshape_b_only_on_first_run{false};
+ bool _run_output_stage{false};
+ bool _convert_to_qasymm8{false};
+ bool _run_offset_contribution{false};
+ bool _is_prepared{false};
GEMMInfo _gemm_info{};
CLGEMMKernelType _gemm_kernel_type{};
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
index a61b11a3b1..e3363e3685 100644
--- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
@@ -27,22 +27,25 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
@@ -70,12 +73,16 @@ void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, c
}
}
-Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+Status ClGemmLowpOutputStage::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16);
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info);
@@ -94,7 +101,7 @@ void ClGemmLowpOutputStage::run(ITensorPack &tensors)
const ITensor *bias = tensors.get_const_tensor(ACL_BIAS);
ITensor *dst = tensors.get_tensor(ACL_DST);
- ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } };
+ ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}};
CLScheduler::get().enqueue_op(*_kernel, pack, true);
}
} // namespace opencl
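ClGemmLowpOutputStage is a thin dispatcher: configure() and validate() switch on GEMMLowpOutputStageInfo::type and forward to one of the quantize-down kernels included at the top of the file. A standalone sketch of that mapping (not ACL code; only the QUANTIZE_DOWN_FIXEDPOINT case is visible in this hunk, the other two cases are assumed to follow the included kernels):

#include <stdexcept>
#include <string>

// Standalone sketch of the type-to-kernel dispatch in ClGemmLowpOutputStage.
enum class GEMMLowpOutputStageType
{
    NONE,
    QUANTIZE_DOWN,
    QUANTIZE_DOWN_FIXEDPOINT,
    QUANTIZE_DOWN_FLOAT
};

std::string output_stage_kernel_name(GEMMLowpOutputStageType type)
{
    switch (type)
    {
        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
            return "ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel";
        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
            return "ClGemmLowpQuantizeDownInt32ScaleByFloatKernel";
        case GEMMLowpOutputStageType::QUANTIZE_DOWN:
            return "ClGemmLowpQuantizeDownInt32ScaleKernel";
        default:
            throw std::runtime_error("Unsupported GEMMLowp output stage type");
    }
}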
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
index 3f1b04dcce..6357e0200b 100644
--- a/src/gpu/cl/operators/ClGemmLowpOutputStage.h
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
@@ -71,14 +71,21 @@ public:
* @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClGemmLowpOutputStage::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp
index b900974574..777fc9e5e1 100644
--- a/src/gpu/cl/operators/ClIndirectConv2d.cpp
+++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp
@@ -27,16 +27,15 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"
#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "src/common/utils/Log.h"
-
using namespace arm_compute::cl_indirect_conv;
namespace arm_compute
@@ -47,7 +46,8 @@ using namespace arm_compute::experimental;
namespace
{
-DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo
+config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
{
// Get GPU target
GPUTarget gpu_target = CLScheduler::get().target();
@@ -59,8 +59,13 @@ DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *
} // namespace
-void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void ClIndirectConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
@@ -86,25 +91,29 @@ void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITenso
CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel);
// Request memory for the indirect buffer
- _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size());
+ _aux_mem[IndirectBuffer] =
+ MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size());
}
-Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+Status ClIndirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
// Initialize the direct convolution descriptor
const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info);
- TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
- src->data_layout(),
- weights->tensor_shape(),
- conv_info,
- desc);
+ TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+ src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(
+ src, weights, &indirect_buffer, conv_info, desc));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst,
+ conv_info, act_info, desc));
return Status{};
}
@@ -124,9 +133,10 @@ void ClIndirectConv2d::run(ITensorPack &tensors)
void ClIndirectConv2d::prepare(ITensorPack &constants)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer)));
+ ICLTensor *indirect_buffer_aux =
+ utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer)));
ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr);
ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer");
@@ -134,7 +144,7 @@ void ClIndirectConv2d::prepare(ITensorPack &constants)
CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux);
ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr);
- ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } };
+ ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}};
CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true);
_is_prepared = true;
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h
index e50fa25069..29e796efd9 100644
--- a/src/gpu/cl/operators/ClIndirectConv2d.h
+++ b/src/gpu/cl/operators/ClIndirectConv2d.h
@@ -77,7 +77,12 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -85,12 +90,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -100,11 +109,11 @@ private:
Count
};
- std::unique_ptr<IClKernel> _indirect_conv_kernel{ nullptr };
- std::unique_ptr<IClKernel> _addr_precalculation_kernel{ nullptr };
+ std::unique_ptr<IClKernel> _indirect_conv_kernel{nullptr};
+ std::unique_ptr<IClKernel> _addr_precalculation_kernel{nullptr};
TensorInfo _indirect_buffer{};
- bool _is_prepared{ false };
- experimental::MemoryRequirements _aux_mem{ Count };
+ bool _is_prepared{false};
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp
index b2eb89b320..d8d4186d00 100644
--- a/src/gpu/cl/operators/ClLogicalNot.cpp
+++ b/src/gpu/cl/operators/ClLogicalNot.cpp
@@ -23,11 +23,10 @@
*/
#include "src/gpu/cl/operators/ClLogicalNot.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 49d14127ca..c14b1f2992 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -47,11 +47,17 @@ ClMatMul::ClMatMul()
{
}
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+Status ClMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
const GPUTarget gpu_target = CLScheduler::get().target();
@@ -61,11 +67,16 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
- return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) :
- ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info)
+ : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
}
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+void ClMatMul::configure(const CLCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
@@ -81,12 +92,13 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l
MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- if(_is_quantized)
+ if (_is_quantized)
{
_matmul_lowp_native_kernel->set_target(gpu_target);
// Configure the low-precision native matrix multiply kernel
- _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info,
+ act_info);
}
else
{
@@ -99,7 +111,7 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l
void ClMatMul::run(ITensorPack &tensors)
{
- if(_is_quantized)
+ if (_is_quantized)
{
CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true);
}
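Both validate() and run() above pick between two kernels from a single predicate: asymmetric quantized operands (QASYMM8/QASYMM8_SIGNED) go to ClMatMulLowpNativeKernel, float operands to ClMatMulNativeKernel. A standalone sketch of that selection (not ACL code):

// Standalone sketch of ClMatMul's kernel choice.
enum class DataType
{
    QASYMM8,
    QASYMM8_SIGNED,
    F16,
    F32
};

bool is_quantized_asymmetric(DataType dt)
{
    return dt == DataType::QASYMM8 || dt == DataType::QASYMM8_SIGNED;
}

const char *matmul_kernel_for(DataType lhs_type)
{
    return is_quantized_asymmetric(lhs_type) ? "ClMatMulLowpNativeKernel" : "ClMatMulNativeKernel";
}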
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index abbb75239a..64dcf217bd 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -26,6 +26,7 @@
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"
+
#include "src/gpu/cl/IClOperator.h"
#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
@@ -73,7 +74,11 @@ public:
* @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
* @param[in] act_info Class containing information about fused activation function.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info,
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -81,15 +86,19 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr };
- std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr };
+ std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr};
+ std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
- bool _is_quantized{ false };
+ bool _is_quantized{false};
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp
index 2066f0cfaa..10cf8a6a38 100644
--- a/src/gpu/cl/operators/ClMul.cpp
+++ b/src/gpu/cl/operators/ClMul.cpp
@@ -24,17 +24,23 @@
#include "src/gpu/cl/operators/ClMul.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClMulKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClMulKernel.h"
namespace arm_compute
{
namespace opencl
{
-void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void ClMul::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
auto k = std::make_unique<kernels::ClMulKernel>();
@@ -42,22 +48,34 @@ void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1
_kernel = std::move(k);
}
-Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status ClMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
}
-void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClComplexMul::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
auto k = std::make_unique<kernels::ClComplexMulKernel>();
k->configure(compile_context, src1, src2, dst, act_info);
_kernel = std::move(k);
}
-Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClComplexMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h
index 6086bc9d52..1cf4d68d4c 100644
--- a/src/gpu/cl/operators/ClMul.h
+++ b/src/gpu/cl/operators/ClMul.h
@@ -66,16 +66,27 @@ public:
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
@@ -92,14 +103,21 @@ public:
* @param[out] dst        The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClComplexMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp
index cf4ebe6083..f3efd00bba 100644
--- a/src/gpu/cl/operators/ClPRelu.cpp
+++ b/src/gpu/cl/operators/ClPRelu.cpp
@@ -23,16 +23,18 @@
*/
#include "src/gpu/cl/operators/ClPRelu.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
namespace arm_compute
{
namespace opencl
{
using KernelType = kernels::ClArithmeticKernel;
-void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
+void ClPRelu::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input,
+ ITensorInfo *alpha,
+ ITensorInfo *output)
{
ARM_COMPUTE_LOG_PARAMS(input, alpha, output);
auto k = std::make_unique<KernelType>();
@@ -49,7 +51,7 @@ void ClPRelu::run(ITensorPack &tensors)
{
// Output tensor can be given as nullptr for in-place computation.
// In this case, get the input tensor and use it as the output tensor.
- if(tensors.get_tensor(TensorType::ACL_DST) == nullptr)
+ if (tensors.get_tensor(TensorType::ACL_DST) == nullptr)
{
auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
@@ -58,4 +60,4 @@ void ClPRelu::run(ITensorPack &tensors)
IClOperator::run(tensors);
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
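The run() override above allows in-place execution: when the pack carries no ACL_DST tensor, the ACL_SRC_0 tensor is reused as the destination. A hedged usage sketch from inside the library tree (run_prelu_in_place is a hypothetical helper; the operator and tensors are assumed to be configured and allocated elsewhere):

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"

#include "src/gpu/cl/operators/ClPRelu.h"

// Hypothetical helper: runs an already-configured ClPRelu in place on `input`.
void run_prelu_in_place(arm_compute::opencl::ClPRelu &prelu, arm_compute::ITensor &input, arm_compute::ITensor &alpha)
{
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &input); // reused as the output
    pack.add_tensor(arm_compute::TensorType::ACL_SRC_1, &alpha);
    // No ACL_DST entry: run() detects the missing destination and writes back into ACL_SRC_0.
    prelu.run(pack);
}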
diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h
index 8084ab86cd..45ce858fb0 100644
--- a/src/gpu/cl/operators/ClPRelu.h
+++ b/src/gpu/cl/operators/ClPRelu.h
@@ -47,7 +47,8 @@ public:
* @param[in] alpha           PRelu layer parameters. Data types supported: same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClPRelu::configure()
diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp
index ed56f97bfe..3851e22b6a 100644
--- a/src/gpu/cl/operators/ClPermute.cpp
+++ b/src/gpu/cl/operators/ClPermute.cpp
@@ -23,16 +23,18 @@
*/
#include "src/gpu/cl/operators/ClPermute.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClPermuteKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+void ClPermute::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, perm);
auto k = std::make_unique<kernels::ClPermuteKernel>();
@@ -45,4 +47,4 @@ Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const
return kernels::ClPermuteKernel::validate(src, dst, perm);
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h
index 3e87329f9b..6349358a18 100644
--- a/src/gpu/cl/operators/ClPermute.h
+++ b/src/gpu/cl/operators/ClPermute.h
@@ -44,7 +44,10 @@ public:
* @param[in] dst The dst tensor info. Data types supported: Same as @p src
* @param[in] perm Permutation vector
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClPermute::configure()
@@ -55,4 +58,4 @@ public:
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PERMUTE_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_PERMUTE_H */
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
index 3da90b8ced..e4507dc1a1 100644
--- a/src/gpu/cl/operators/ClPool2d.cpp
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -25,16 +25,19 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClPool2dKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
+void ClPool2d::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices);
@@ -49,7 +52,10 @@ void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *s
CLScheduler::get().tune_kernel_static(*_kernel);
}
-Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+Status ClPool2d::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const ITensorInfo *indices)
{
return kernels::ClPool2dKernel::validate(src, dst, info, indices);
}
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
index f353ba262e..9c2fd1c3f2 100644
--- a/src/gpu/cl/operators/ClPool2d.h
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -50,14 +50,21 @@ public:
* @param[in] info Pooling layer parameters.
* @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClPool2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const ITensorInfo *indices = nullptr);
};
} // namespace opencl
} // namespace arm_compute
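The header above documents the two-step contract these operators share: call the static validate() with ITensorInfo descriptors first, then configure() and run(). A hedged sketch of the validate() call for a max pooling case (shapes, pool size and strides are placeholders; whether a given configuration is accepted is up to the kernel, and a CL context may need to be initialised first):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

#include "src/gpu/cl/operators/ClPool2d.h"

// Placeholder NHWC shapes: 1 batch, 16x16 spatial, 32 channels; 2x2 max pool, stride 2.
arm_compute::Status check_pool2d_config()
{
    using namespace arm_compute;
    TensorInfo src(TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 8U, 8U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PoolingLayerInfo info(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));
    // indices defaults to nullptr, i.e. the positions of the maxima are not requested.
    return opencl::ClPool2d::validate(&src, &dst, info);
}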
diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp
index 7dec6c5958..d230413659 100644
--- a/src/gpu/cl/operators/ClPool3d.cpp
+++ b/src/gpu/cl/operators/ClPool3d.cpp
@@ -25,16 +25,18 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClPool3dKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info)
+void ClPool3d::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, dst, info);
diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h
index 7d994fd194..9fd78bfd69 100644
--- a/src/gpu/cl/operators/ClPool3d.h
+++ b/src/gpu/cl/operators/ClPool3d.h
@@ -51,7 +51,10 @@ public:
* @param[out] dst Destination tensor info.
* @param[in] info 3d Pooling layer parameters.
*/
- void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info);
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClPool3d::configure()
diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp
index 47ae5cea47..8560b5553e 100644
--- a/src/gpu/cl/operators/ClQuantize.cpp
+++ b/src/gpu/cl/operators/ClQuantize.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
namespace arm_compute
{
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
index 560966f4fc..1dd5b760cb 100644
--- a/src/gpu/cl/operators/ClReshape.cpp
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -23,11 +23,10 @@
*/
#include "src/gpu/cl/operators/ClReshape.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClReshapeKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
@@ -45,4 +44,4 @@ Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
return kernels::ClReshapeKernel::validate(src, dst);
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp
index 0798b19ca0..184e2aa006 100644
--- a/src/gpu/cl/operators/ClScale.cpp
+++ b/src/gpu/cl/operators/ClScale.cpp
@@ -25,17 +25,20 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClScaleKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+void ClScale::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src);
ARM_COMPUTE_LOG_PARAMS(src, dst, info);
@@ -61,4 +64,4 @@ void ClScale::run(ITensorPack &tensors)
CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h
index af97cf23e7..1427bb4fdc 100644
--- a/src/gpu/cl/operators/ClScale.h
+++ b/src/gpu/cl/operators/ClScale.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_SCALE_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -49,7 +50,8 @@ public:
* All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClScale::configure()
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
index 03809553a3..2bec400597 100644
--- a/src/gpu/cl/operators/ClSoftmax.cpp
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -22,7 +22,10 @@
* SOFTWARE.
*/
#include "src/gpu/cl/operators/ClSoftmax.h"
+
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
@@ -30,8 +33,6 @@
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
#include "support/Cast.h"
-#include "src/common/utils/Log.h"
-
using namespace arm_compute::experimental;
namespace arm_compute
@@ -52,7 +53,10 @@ ClSoftmax::ClSoftmax()
{
}
-void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info)
+void ClSoftmax::configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, dst, info);
@@ -64,14 +68,15 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor
const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src;
ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst;
- if(_needs_permute)
+ if (_needs_permute)
{
const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
_permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info);
}
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
- _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
+ DataType tmp_data_type =
+ is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
+ _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
TensorShape max_sum_shape = tmp_input_info.tensor_shape();
_max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
@@ -83,33 +88,41 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor
_max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
_norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
- if(_needs_permute)
+ if (_needs_permute)
{
const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
_permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
}
- _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
-
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size());
+ _aux_mem[InternalTensorIdx::SUM] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
+ _aux_mem[InternalTensorIdx::MAX] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
+
+ _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
+ MemoryLifetime::Temporary, _permuted_src_info.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
+ MemoryLifetime::Temporary, _permuted_dst_info.total_size());
}
Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
ARM_COMPUTE_UNUSED(info.beta);
- ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
+ static_cast<int32_t>(src.num_dimensions()) <= info.axis);
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
+ const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
const bool needs_permute = actual_axis != 0;
- if(needs_permute)
+ if (needs_permute)
{
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
- TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
+ const PermutationVector permutation_vector =
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape =
+ misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
+ TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
@@ -122,9 +135,14 @@ Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const
TensorShape max_sum_shape = src.tensor_shape();
max_sum_shape.set(0, 1);
TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
- TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
+ TensorInfo tensor_info_sum(src.clone()
+ ->set_tensor_shape(max_sum_shape)
+ .set_data_type(tmp_data_type)
+ .set_quantization_info(QuantizationInfo())
+ .set_is_resizable(true));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info));
return Status{};
@@ -139,10 +157,12 @@ void ClSoftmax::run(ITensorPack &tensors)
CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);
CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false);
- CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false);
- CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false);
+ CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors,
+ false);
+ CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors,
+ false);
- if(_needs_permute)
+ if (_needs_permute)
{
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC, src);
@@ -152,7 +172,7 @@ void ClSoftmax::run(ITensorPack &tensors)
ITensorPack sum_pack;
ITensorPack norm_pack;
- if(_needs_permute)
+ if (_needs_permute)
{
sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get());
norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get());
@@ -172,7 +192,7 @@ void ClSoftmax::run(ITensorPack &tensors)
CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false);
CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false);
- if(_needs_permute)
+ if (_needs_permute)
{
ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get());
@@ -186,4 +206,4 @@ experimental::MemoryRequirements ClSoftmax::workspace() const
return _aux_mem;
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h
index 6c9af585d6..6c2aaaea80 100644
--- a/src/gpu/cl/operators/ClSoftmax.h
+++ b/src/gpu/cl/operators/ClSoftmax.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_SOFTMAX_H
#include "arm_compute/runtime/CL/CLTensor.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -52,7 +53,10 @@ public:
* @param[out] dst Destination tensor info. Data types supported: same as @p src
* @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClSoftmax::configure()
@@ -61,7 +65,7 @@ public:
*/
static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -79,7 +83,7 @@ private:
std::unique_ptr<ClPermute> _permute_output;
std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel;
- bool _needs_permute{ false };
+ bool _needs_permute{false};
TensorInfo _max_info;
TensorInfo _sum_info;
@@ -90,6 +94,6 @@ private:
experimental::MemoryRequirements _aux_mem{};
};
-} // opencl
-} // arm_compute
-#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
\ No newline at end of file
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp
index 53be04a70f..5c6d0c3184 100644
--- a/src/gpu/cl/operators/ClSub.cpp
+++ b/src/gpu/cl/operators/ClSub.cpp
@@ -23,17 +23,20 @@
*/
#include "src/gpu/cl/operators/ClSub.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
{
-void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void ClSub::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
@@ -41,8 +44,11 @@ void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1
_kernel = std::move(k);
}
-Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status ClSub::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
}
diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h
index 7eac437143..6a97275b86 100644
--- a/src/gpu/cl/operators/ClSub.h
+++ b/src/gpu/cl/operators/ClSub.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_SUB_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -65,7 +66,11 @@ public:
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -73,7 +78,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
};
} // namespace opencl
diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp
index 26feffe2b9..28da0d640a 100644
--- a/src/gpu/cl/operators/ClTranspose.cpp
+++ b/src/gpu/cl/operators/ClTranspose.cpp
@@ -23,11 +23,10 @@
*/
#include "src/gpu/cl/operators/ClTranspose.h"
+#include "src/common/utils/Log.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/kernels/ClTransposeKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace opencl
@@ -45,4 +44,4 @@ Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
return kernels::ClTransposeKernel::validate(src, dst);
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp
index 90dbe7f291..cec438faeb 100644
--- a/src/gpu/cl/operators/ClTransposedConvolution.cpp
+++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h"
@@ -32,8 +33,12 @@ namespace arm_compute
{
namespace opencl
{
-void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info)
+void ClTransposedConvolution::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info);
@@ -43,10 +48,14 @@ void ClTransposedConvolution::configure(const CLCompileContext &compile_context,
_transposed_conv_kernel = std::move(kernel_object);
}
-Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
- const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status ClTransposedConvolution::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info));
return Status{};
}
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h
index 58ebc689ed..660c4f85c1 100644
--- a/src/gpu/cl/operators/ClTransposedConvolution.h
+++ b/src/gpu/cl/operators/ClTransposedConvolution.h
@@ -68,23 +68,30 @@ public:
* @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo.
*
*/
- void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClTransposedConvolution::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
- const ITensorInfo *output, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
// Inherited method overridden
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<IClKernel> _transposed_conv_kernel{ nullptr };
+ std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr};
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
index b4163a5986..8ec96b247e 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -24,20 +24,19 @@
#include "src/gpu/cl/operators/ClWinogradConv2d.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "src/common/utils/Log.h"
#include "support/Cast.h"
using namespace arm_compute::experimental;
@@ -55,15 +54,16 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims,
const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
// Check if the input spatial dimensions are smaller than 4
- const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+ const bool is_input_lt4_nchw =
+ (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
- if(kernel_max_dim == 3U)
+ if (kernel_max_dim == 3U)
{
- if(kernel_dims == Size2D(3U, 3U))
+ if (kernel_dims == Size2D(3U, 3U))
{
output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
}
- else if(kernel_dims == Size2D(3U, 1U))
+ else if (kernel_dims == Size2D(3U, 1U))
{
output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
}
@@ -72,15 +72,13 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims,
output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
}
}
- else if(kernel_max_dim == 5U)
+ else if (kernel_max_dim == 5U)
{
- output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
- kernel_dims.height == 1 ? 1U : 4U);
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U);
}
- else if(kernel_max_dim == 7U)
+ else if (kernel_max_dim == 7U)
{
- output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
- kernel_dims.height == 1 ? 1U : 2U);
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U);
}
return output_tile;
@@ -91,11 +89,9 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
// Check if we want to configure a Winograd configuration which requires fast math
using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
- std::vector<WinogradConfiguration> fast_math_winograd =
- {
+ std::vector<WinogradConfiguration> fast_math_winograd = {
WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
- WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
- };
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))};
auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
std::pair<int, int>(kernel_size.width, kernel_size.height));
@@ -103,8 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
}
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
// Get indeces for the width and height
const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
@@ -115,41 +116,49 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))),
+ "Winograd only supports padding up to half kernel size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))),
+ "Winograd only supports padding up to half kernel size");
// Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
+ if (!enable_fast_math)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+ "This Winograd configuration requires enable_fast_math=true");
}
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
+ const WinogradInfo winograd_info =
+ WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
// Validate input transform
- const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
- const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
+ const TensorShape input0_shape =
+ misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+ const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
// Validate filter transform
- const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
- const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ const TensorShape input1_shape =
+ misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
// Validate batched matrix multiply
TensorShape batched_mm_output_shape = input0.tensor_shape();
batched_mm_output_shape[0] = input1.tensor_shape()[0];
const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
- GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f,
+ GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
// Configure output transform
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
return Status{};
}
@@ -171,8 +180,14 @@ ClWinogradConv2d::ClWinogradConv2d()
ClWinogradConv2d::~ClWinogradConv2d() = default;
-void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void ClWinogradConv2d::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
@@ -187,50 +202,53 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso
const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
// Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
+ if (!enable_fast_math)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1,
+ DataType::F32); //disable winograd for fp16 if fast math is false.
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+ "This Winograd configuration requires enable_fast_math=true");
}
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- src->data_layout());
+ const WinogradInfo winograd_info =
+ WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
_is_prepared = false;
// Configure input transform
_input_transform->configure(compile_context, src, &_input0, winograd_info);
- _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue());
+ _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT,
+ PixelValue());
// Configure filter transform
_filter_transform->configure(compile_context, weights, &_input1, winograd_info);
// Configure batched matrix multiply
- _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0,
- false, false,
- GEMMLowpOutputStageInfo(),
- (src->data_type() == DataType::F16)));
+ _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f,
+ GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)));
// Configure output transform
_output_transform->set_target(CLScheduler::get().target());
_output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
- _aux_mem = _batched_mm.workspace();
- const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
- {
- return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
- }) ?
- MemoryLifetime::Prepare :
- MemoryLifetime::Persistent;
+ _aux_mem = _batched_mm.workspace();
+ const MemoryLifetime wino_wei_lifetm =
+ std::any_of(std::begin(_aux_mem), std::end(_aux_mem),
+ [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
+ ? MemoryLifetime::Prepare
+ : MemoryLifetime::Persistent;
_aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
_aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
_aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
}
-Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status ClWinogradConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
return Status{};
@@ -251,10 +269,9 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
prepare(tensors);
// Run input transform
- ITensorPack pack_it
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, input0.get() },
+ ITensorPack pack_it{
+ {TensorType::ACL_SRC, src},
+ {TensorType::ACL_DST, input0.get()},
};
CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
@@ -263,31 +280,31 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
ITensorPack pack_mm = tensors;
pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
- is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+ is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1)
+ : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
_batched_mm.run(pack_mm);
// Run output transform
- ITensorPack pack_ot
- {
- { TensorType::ACL_SRC_0, batched_mm_output.get() },
- { TensorType::ACL_SRC_1, biases },
- { TensorType::ACL_DST, dst },
+ ITensorPack pack_ot{
+ {TensorType::ACL_SRC_0, batched_mm_output.get()},
+ {TensorType::ACL_SRC_1, biases},
+ {TensorType::ACL_DST, dst},
};
CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
}
void ClWinogradConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
CLAuxTensorHandler input1(_input1, *in1_aux);
- ITensorPack pack_ft
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, input1.get() },
+ ITensorPack pack_ft{
+ {TensorType::ACL_SRC, weights},
+ {TensorType::ACL_DST, input1.get()},
};
// Run filter transform and mark original weights as unused
CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
@@ -308,4 +325,4 @@ experimental::MemoryRequirements ClWinogradConv2d::workspace() const
return _aux_mem;
}
} // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
index eb2f7a72b2..54ec1a1737 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.h
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
#include "arm_compute/runtime/CL/CLTensor.h"
+
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
@@ -41,7 +42,7 @@ namespace kernels
class ClWinogradInputTransformKernel;
class ClWinogradFilterTransformKernel;
class ClWinogradOutputTransformKernel;
-} // kernels
+} // namespace kernels
/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
*
* -# @ref kernels::ClWinogradInputTransformKernel
@@ -93,20 +94,31 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to ClWinogradConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited method overridden
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h
index af383489a1..81dc3baef4 100644
--- a/src/gpu/cl/utils/ClAuxTensorHandler.h
+++ b/src/gpu/cl/utils/ClAuxTensorHandler.h
@@ -39,25 +39,26 @@ namespace opencl
class CLAuxTensorHandler
{
public:
- CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+ CLAuxTensorHandler(
+ int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
: _tensor()
{
- if(info.total_size() == 0)
+ if (info.total_size() == 0)
{
return;
}
_tensor.allocator()->soft_init(info);
ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
- if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+ if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
{
- if(!bypass_alloc)
+ if (!bypass_alloc)
{
_tensor.allocator()->allocate();
ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
}
- if(pack_inject)
+ if (pack_inject)
{
pack.add_tensor(slot_id, &_tensor);
_injected_tensor_pack = &pack;
@@ -70,22 +71,21 @@ public:
}
}
- CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor)
- : _tensor()
+ CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor()
{
_tensor.allocator()->soft_init(info);
- if(info.total_size() <= tensor.info()->total_size())
+ if (info.total_size() <= tensor.info()->total_size())
{
_tensor.allocator()->import_memory(tensor.cl_buffer());
}
}
- CLAuxTensorHandler(const CLAuxTensorHandler &) = delete;
+ CLAuxTensorHandler(const CLAuxTensorHandler &) = delete;
CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete;
~CLAuxTensorHandler()
{
- if(_injected_tensor_pack)
+ if (_injected_tensor_pack)
{
_injected_tensor_pack->remove_tensor(_injected_slot_id);
}
@@ -103,9 +103,9 @@ public:
private:
CLTensor _tensor{};
- ITensorPack *_injected_tensor_pack{ nullptr };
- int _injected_slot_id{ TensorType::ACL_UNKNOWN };
+ ITensorPack *_injected_tensor_pack{nullptr};
+ int _injected_slot_id{TensorType::ACL_UNKNOWN};
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp
index 073ffd413d..f0fac25577 100644
--- a/src/graph/DataLayerVisitor.cpp
+++ b/src/graph/DataLayerVisitor.cpp
@@ -25,8 +25,8 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/TypePrinter.h"
namespace arm_compute
{
@@ -43,17 +43,14 @@ void add_convolution_layer_data(DataLayerVisitor::LayerData &layer_data, T &node
layer_data["data_layout"] = to_string(layout);
// Add padding info
std::ostringstream padding;
- padding << "[" << to_string(ps_info.pad_left()) << ","
- << to_string(ps_info.pad_top()) << ","
- << to_string(ps_info.pad_bottom()) << ","
- << to_string(ps_info.pad_right()) << "]";
+ padding << "[" << to_string(ps_info.pad_left()) << "," << to_string(ps_info.pad_top()) << ","
+ << to_string(ps_info.pad_bottom()) << "," << to_string(ps_info.pad_right()) << "]";
layer_data["pad"] = padding.str();
// Add stride info
std::ostringstream stride;
- stride << "[" << to_string(ps_info.stride().first) << ","
- << to_string(ps_info.stride().second) << "]";
+ stride << "[" << to_string(ps_info.stride().first) << "," << to_string(ps_info.stride().second) << "]";
layer_data["stride"] = stride.str();
@@ -68,12 +65,12 @@ void add_convolution_layer_data(DataLayerVisitor::LayerData &layer_data, T &node
// Change input names for weights / bias (if applicable)
// Assumes input(1) is weights and input(2) is bias
- if(layer_data.count("input_shape1"))
+ if (layer_data.count("input_shape1"))
{
layer_data["weights_shape"] = layer_data["input_shape1"];
layer_data.erase("input_shape1");
}
- if(layer_data.count("input_shape2"))
+ if (layer_data.count("input_shape2"))
{
layer_data["bias_shape"] = layer_data["input_shape2"];
layer_data.erase("input_shape2");
@@ -92,16 +89,17 @@ template <typename T>
void add_generic_layer_data(DataLayerVisitor::LayerData &layer_data, T &node)
{
// Loop over each input tensor
- for(size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no)
+ for (size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no)
{
// Add input tensor shapes
- if(node.input(tensor_no) != nullptr)
+ if (node.input(tensor_no) != nullptr)
{
- layer_data["input_shape" + to_string(tensor_no)] = "[" + to_string(node.input(tensor_no)->desc().shape) + "]";
+ layer_data["input_shape" + to_string(tensor_no)] =
+ "[" + to_string(node.input(tensor_no)->desc().shape) + "]";
}
}
// Add output tensor shape
- if(node.output(0) != nullptr)
+ if (node.output(0) != nullptr)
{
layer_data["output_shape0"] = "[" + to_string(node.output(0)->desc().shape) + "]";
}
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 4ce53589d4..3ae83f2e80 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -34,24 +34,24 @@ Graph::Graph(GraphID id, std::string name)
bool Graph::remove_node(NodeID nid)
{
- if(nid >= _nodes.size())
+ if (nid >= _nodes.size())
{
return false;
}
std::unique_ptr<INode> &node = _nodes[nid];
- if(node)
+ if (node)
{
// Remove input connections
- for(auto &input_eid : node->_input_edges)
+ for (auto &input_eid : node->_input_edges)
{
remove_connection(input_eid);
}
// Remove output connections
std::set<EdgeID> output_edges_copy = node->output_edges();
- for(auto &output_eid : output_edges_copy)
+ for (auto &output_eid : output_edges_copy)
{
remove_connection(output_eid);
}
@@ -71,8 +71,10 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
// Check if node index is valid, if node exists and finally if the connection index is valid
- ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || (source_idx >= _nodes[source]->num_outputs()));
- ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || (sink_idx >= _nodes[sink]->num_inputs()));
+ ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) ||
+ (source_idx >= _nodes[source]->num_outputs()));
+ ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) ||
+ (sink_idx >= _nodes[sink]->num_inputs()));
// Get nodes
std::unique_ptr<INode> &source_node = _nodes[source];
@@ -80,23 +82,25 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size
// Check for duplicate connections (Check only sink node)
Edge *sink_node_edge = sink_node->input_edge(sink_idx);
- if((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && (sink_node_edge->producer_idx() == source_idx)
- && (sink_node_edge->consumer_id() == sink) && (sink_node_edge->consumer_idx() == sink_idx))
+ if ((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) &&
+ (sink_node_edge->producer_idx() == source_idx) && (sink_node_edge->consumer_id() == sink) &&
+ (sink_node_edge->consumer_idx() == sink_idx))
{
return sink_node_edge->id();
}
// Check if there is already a tensor associated with output if not create one
TensorID tid = source_node->output_id(source_idx);
- if(tid == NullTensorID)
+ if (tid == NullTensorID)
{
tid = create_tensor();
}
std::unique_ptr<Tensor> &tensor = _tensors[tid];
// Create connections
- EdgeID eid = _edges.size();
- auto connection = std::make_unique<Edge>(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get());
+ EdgeID eid = _edges.size();
+ auto connection =
+ std::make_unique<Edge>(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get());
_edges.push_back(std::move(connection));
// Add connections to source and sink nodes
@@ -117,7 +121,7 @@ EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size
bool Graph::remove_connection(EdgeID eid)
{
- if(eid >= _edges.size())
+ if (eid >= _edges.size())
{
return false;
}
@@ -125,22 +129,22 @@ bool Graph::remove_connection(EdgeID eid)
std::unique_ptr<Edge> &edge = _edges[eid];
// Remove node connections
- if(edge != nullptr)
+ if (edge != nullptr)
{
// Get tensor bound to the edge
- if(edge->tensor() != nullptr)
+ if (edge->tensor() != nullptr)
{
edge->tensor()->unbind_edge(eid);
}
// Remove edges from source node
- if(edge->producer() != nullptr)
+ if (edge->producer() != nullptr)
{
edge->producer()->_output_edges.erase(eid);
}
// Remove edges from sink node
- if((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size()))
+ if ((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size()))
{
edge->consumer()->_input_edges[edge->consumer_idx()] = EmptyEdgeID;
}
@@ -231,4 +235,4 @@ Tensor *Graph::tensor(TensorID id)
return (id >= _tensors.size()) ? nullptr : _tensors[id].get();
}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index 7e5d3133d1..eab91b2347 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -24,10 +24,10 @@
#include "arm_compute/graph/GraphBuilder.h"
#include "arm_compute/core/utils/DataTypeUtils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/algorithms/TopologicalSort.h"
+#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
#include "support/ToolchainSupport.h"
@@ -41,7 +41,8 @@ inline void check_nodeidx_pair(const NodeIdxPair &pair, const Graph &g)
{
ARM_COMPUTE_UNUSED(pair);
ARM_COMPUTE_UNUSED(g);
- ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || (pair.index >= g.node(pair.node_id)->num_outputs()));
+ ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) ||
+ (pair.index >= g.node(pair.node_id)->num_outputs()));
}
Status set_node_params(Graph &g, NodeID nid, NodeParams &params)
@@ -67,7 +68,8 @@ Status set_accessor_on_node(Graph &g, NodeID nid, bool is_output, size_t idx, IT
return Status{};
}
-NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
+NodeID add_const_node_with_name(
+ Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
{
params.name = params.name.empty() ? "" : params.name + name;
auto nid = GraphBuilder::add_const_node(g, params, desc, std::move(accessor));
@@ -76,7 +78,7 @@ NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &
}
template <typename NT, typename... Args>
-NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&... args)
+NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&...args)
{
check_nodeidx_pair(input, g);
@@ -88,14 +90,17 @@ NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, Node
}
template <typename NT, typename... Args>
-NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &params, const std::vector<NodeIdxPair> &inputs, Args &&... args)
+NodeID create_simple_multiple_input_single_output_node(Graph &g,
+ NodeParams &params,
+ const std::vector<NodeIdxPair> &inputs,
+ Args &&...args)
{
ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
unsigned int i = 0;
- for(const auto &input : inputs)
+ for (const auto &input : inputs)
{
check_nodeidx_pair(input, g);
g.add_connection(input.node_id, input.index, nid, i++);
@@ -106,7 +111,8 @@ NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &par
}
} // namespace
-NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
+NodeID
+GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
{
auto nid = g.add_node<ConstNode>(desc);
set_node_params(g, nid, params);
@@ -114,7 +120,8 @@ NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDes
return nid;
}
-NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
+NodeID
+GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
{
auto nid = g.add_node<InputNode>(desc);
set_node_params(g, nid, params);
@@ -134,21 +141,35 @@ NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair in
return nid;
}
-NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info,
+NodeID GraphBuilder::add_activation_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ActivationLayerInfo act_info,
const QuantizationInfo &out_quant_info)
{
return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info, out_quant_info);
}
-NodeID GraphBuilder::add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis,
- DataType out_data_type, const QuantizationInfo &out_quant_info)
+NodeID GraphBuilder::add_arg_min_max_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ReductionOperation op,
+ unsigned int axis,
+ DataType out_data_type,
+ const QuantizationInfo &out_quant_info)
{
- return create_simple_single_input_output_node<ArgMinMaxLayerNode>(g, params, input, op, axis, out_data_type, out_quant_info);
+ return create_simple_single_input_output_node<ArgMinMaxLayerNode>(g, params, input, op, axis, out_data_type,
+ out_quant_info);
}
-NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
- ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor,
- ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor)
+NodeID GraphBuilder::add_batch_normalization_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ float epsilon,
+ ITensorAccessorUPtr mean_accessor,
+ ITensorAccessorUPtr var_accessor,
+ ITensorAccessorUPtr beta_accessor,
+ ITensorAccessorUPtr gamma_accessor)
{
check_nodeidx_pair(input, g);
@@ -168,14 +189,14 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N
// Create beta node
NodeID beta_nid = EmptyNodeID;
- if(has_beta)
+ if (has_beta)
{
beta_nid = add_const_node_with_name(g, params, "Beta", common_desc, std::move(beta_accessor));
}
// Create gamma node
NodeID gamma_nid = EmptyNodeID;
- if(has_gamma)
+ if (has_gamma)
{
gamma_nid = add_const_node_with_name(g, params, "Gamma", common_desc, std::move(gamma_accessor));
}
@@ -185,11 +206,11 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N
g.add_connection(input.node_id, input.index, batch_norm_nid, 0);
g.add_connection(mean_nid, 0, batch_norm_nid, 1);
g.add_connection(var_nid, 0, batch_norm_nid, 2);
- if(has_beta)
+ if (has_beta)
{
g.add_connection(beta_nid, 0, batch_norm_nid, 3);
}
- if(has_gamma)
+ if (has_gamma)
{
g.add_connection(gamma_nid, 0, batch_norm_nid, 4);
}
@@ -198,7 +219,8 @@ NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, N
return batch_norm_nid;
}
-NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
+NodeID GraphBuilder::add_bounding_box_transform_node(
+ Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
{
check_nodeidx_pair(input, g);
check_nodeidx_pair(deltas, g);
@@ -217,10 +239,17 @@ NodeID GraphBuilder::add_channel_shuffle_node(Graph &g, NodeParams params, NodeI
return create_simple_single_input_output_node<ChannelShuffleLayerNode>(g, params, input, num_groups);
}
-NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info,
- unsigned int num_groups, ConvolutionMethod method, FastMathHint fast_math_hint,
- ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
+NodeID GraphBuilder::add_convolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ unsigned int depth,
+ PadStrideInfo conv_info,
+ unsigned int num_groups,
+ ConvolutionMethod method,
+ FastMathHint fast_math_hint,
+ ITensorAccessorUPtr weights_accessor,
+ ITensorAccessorUPtr bias_accessor,
const QuantizationInfo &weights_quant_info,
const QuantizationInfo &out_quant_info)
{
@@ -241,7 +270,7 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa
w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups);
w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth);
- if(!weights_quant_info.empty())
+ if (!weights_quant_info.empty())
{
w_desc.quant_info = weights_quant_info;
}
@@ -250,11 +279,11 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa
// Create bias nodes
NodeID b_nid = EmptyNodeID;
- if(has_bias)
+ if (has_bias)
{
TensorDescriptor b_desc = input_tensor_desc;
b_desc.shape = TensorShape(depth);
- if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
+ if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
{
b_desc.data_type = DataType::S32;
}
@@ -265,7 +294,7 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa
NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, num_groups, method, fast_math_hint, out_quant_info);
g.add_connection(input.node_id, input.index, conv_nid, 0);
g.add_connection(w_nid, 0, conv_nid, 1);
- if(has_bias)
+ if (has_bias)
{
g.add_connection(b_nid, 0, conv_nid, 2);
}
@@ -274,8 +303,12 @@ NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPa
return conv_nid;
}
-NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info,
+NodeID GraphBuilder::add_deconvolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ unsigned int depth,
+ PadStrideInfo deconv_info,
ITensorAccessorUPtr weights_accessor,
ITensorAccessorUPtr bias_accessor)
{
@@ -301,11 +334,11 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx
// Create bias nodes
NodeID b_nid = EmptyNodeID;
- if(has_bias)
+ if (has_bias)
{
TensorDescriptor b_desc = input_tensor_desc;
b_desc.shape = TensorShape(depth);
- if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
+ if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
{
b_desc.data_type = DataType::S32;
}
@@ -313,10 +346,10 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx
}
// Create convolution node and connect
- NodeID deconv_nid = g.add_node<DeconvolutionLayerNode>(descriptors::DeconvolutionLayerDescriptor{ deconv_info });
+ NodeID deconv_nid = g.add_node<DeconvolutionLayerNode>(descriptors::DeconvolutionLayerDescriptor{deconv_info});
g.add_connection(input.node_id, input.index, deconv_nid, 0);
g.add_connection(w_nid, 0, deconv_nid, 1);
- if(has_bias)
+ if (has_bias)
{
g.add_connection(b_nid, 0, deconv_nid, 2);
}
@@ -325,14 +358,26 @@ NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdx
return deconv_nid;
}
-NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor)
+NodeID GraphBuilder::add_concatenate_node(Graph &g,
+ NodeParams params,
+ const std::vector<NodeIdxPair> &inputs,
+ const descriptors::ConcatLayerDescriptor &concat_descriptor)
{
- return create_simple_multiple_input_single_output_node<ConcatenateLayerNode>(g, params, inputs, inputs.size(), concat_descriptor);
+ return create_simple_multiple_input_single_output_node<ConcatenateLayerNode>(g, params, inputs, inputs.size(),
+ concat_descriptor);
}
-NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend,
- PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method,
- ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo &quant_info, const QuantizationInfo &out_quant_info)
+NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ PadStrideInfo conv_info,
+ int depth_multiplier,
+ DepthwiseConvolutionMethod method,
+ ITensorAccessorUPtr weights_accessor,
+ ITensorAccessorUPtr bias_accessor,
+ const QuantizationInfo &quant_info,
+ const QuantizationInfo &out_quant_info)
{
check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
@@ -349,7 +394,7 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params,
w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
- if(!quant_info.empty())
+ if (!quant_info.empty())
{
w_desc.quant_info = quant_info;
}
@@ -358,12 +403,13 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params,
// Create bias nodes
NodeID b_nid = EmptyNodeID;
- if(has_bias)
+ if (has_bias)
{
TensorDescriptor b_desc = input_tensor_desc;
- b_desc.shape = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
+ b_desc.shape =
+ TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
- if(is_data_type_quantized_asymmetric(b_desc.data_type))
+ if (is_data_type_quantized_asymmetric(b_desc.data_type))
{
b_desc.data_type = DataType::S32;
}
@@ -375,7 +421,7 @@ NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params,
NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method, out_quant_info);
g.add_connection(input.node_id, input.index, conv_nid, 0);
g.add_connection(w_nid, 0, conv_nid, 1);
- if(has_bias)
+ if (has_bias)
{
g.add_connection(b_nid, 0, conv_nid, 2);
}
@@ -394,7 +440,12 @@ NodeID GraphBuilder::add_dequantization_node(Graph &g, NodeParams params, NodeId
return create_simple_single_input_output_node<DequantizationLayerNode>(g, params, input);
}
-NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info)
+NodeID GraphBuilder::add_detection_output_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input_loc,
+ NodeIdxPair input_conf,
+ NodeIdxPair input_priorbox,
+ const DetectionOutputLayerInfo &detect_info)
{
check_nodeidx_pair(input_loc, g);
check_nodeidx_pair(input_conf, g);
@@ -411,18 +462,24 @@ NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, Node
return detect_nid;
}
-NodeID GraphBuilder::add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction, const DetectionPostProcessLayerInfo &detect_info,
- ITensorAccessorUPtr anchors_accessor, const QuantizationInfo &anchor_quant_info)
+NodeID GraphBuilder::add_detection_post_process_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input_box_encoding,
+ NodeIdxPair input_class_prediction,
+ const DetectionPostProcessLayerInfo &detect_info,
+ ITensorAccessorUPtr anchors_accessor,
+ const QuantizationInfo &anchor_quant_info)
{
check_nodeidx_pair(input_box_encoding, g);
check_nodeidx_pair(input_class_prediction, g);
// Get input tensor descriptor
- const TensorDescriptor input_box_encoding_tensor_desc = get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]);
+ const TensorDescriptor input_box_encoding_tensor_desc =
+ get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]);
// Calculate anchor descriptor
TensorDescriptor anchor_desc = input_box_encoding_tensor_desc;
- if(!anchor_quant_info.empty())
+ if (!anchor_quant_info.empty())
{
anchor_desc.quant_info = anchor_quant_info;
}
@@ -446,12 +503,13 @@ NodeID GraphBuilder::add_dummy_node(Graph &g, NodeParams params, NodeIdxPair inp
return create_simple_single_input_output_node<DummyNode>(g, params, input, shape);
}
-NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
+NodeID GraphBuilder::add_elementwise_node(
+ Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
{
check_nodeidx_pair(input0, g);
check_nodeidx_pair(input1, g);
- NodeID nid = g.add_node<EltwiseLayerNode>(descriptors::EltwiseLayerDescriptor{ operation });
+ NodeID nid = g.add_node<EltwiseLayerNode>(descriptors::EltwiseLayerDescriptor{operation});
g.add_connection(input0.node_id, input0.index, nid, 0);
g.add_connection(input1.node_id, input1.index, nid, 1);
@@ -466,9 +524,15 @@ NodeID GraphBuilder::add_flatten_node(Graph &g, NodeParams params, NodeIdxPair i
return create_simple_single_input_output_node<FlattenLayerNode>(g, params, input);
}
-NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
- NodeID weights_nid, NodeID bias_nid,
- const FullyConnectedLayerInfo fc_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint)
+NodeID GraphBuilder::add_fully_connected_layer(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ unsigned int num_outputs,
+ NodeID weights_nid,
+ NodeID bias_nid,
+ const FullyConnectedLayerInfo fc_info,
+ const QuantizationInfo &out_quant_info,
+ FastMathHint fast_math_hint)
{
check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON(num_outputs == 0);
@@ -483,7 +547,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node
NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs, out_quant_info, fc_info, fast_math_hint);
g.add_connection(input.node_id, input.index, fc_nid, 0);
g.add_connection(weights_nid, 0, fc_nid, 1);
- if(has_bias)
+ if (has_bias)
{
g.add_connection(bias_nid, 0, fc_nid, 2);
}
@@ -493,10 +557,16 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node
return fc_nid;
}
-NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
- ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
+NodeID GraphBuilder::add_fully_connected_layer(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ unsigned int num_outputs,
+ ITensorAccessorUPtr weights_accessor,
+ ITensorAccessorUPtr bias_accessor,
const FullyConnectedLayerInfo fc_info,
- const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint)
+ const QuantizationInfo &weights_quant_info,
+ const QuantizationInfo &out_quant_info,
+ FastMathHint fast_math_hint)
{
check_nodeidx_pair(input, g);
ARM_COMPUTE_ERROR_ON(num_outputs == 0);
@@ -507,16 +577,17 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
// Create weights node
- TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs, fc_info, weights_quant_info);
+ TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs,
+ fc_info, weights_quant_info);
NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
// Create bias nodes
NodeID b_nid = EmptyNodeID;
- if(has_bias)
+ if (has_bias)
{
TensorDescriptor b_desc = input_tensor_desc;
b_desc.shape = TensorShape(num_outputs);
- if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
+ if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
{
b_desc.data_type = DataType::S32;
}
@@ -527,7 +598,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node
NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs, out_quant_info, fc_info, fast_math_hint);
g.add_connection(input.node_id, input.index, fc_nid, 0);
g.add_connection(w_nid, 0, fc_nid, 1);
- if(has_bias)
+ if (has_bias)
{
g.add_connection(b_nid, 0, fc_nid, 2);
}
@@ -537,7 +608,12 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node
return fc_nid;
}
-NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info)
+NodeID GraphBuilder::add_generate_proposals_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair scores,
+ NodeIdxPair deltas,
+ NodeIdxPair anchors,
+ GenerateProposalsInfo info)
{
check_nodeidx_pair(scores, g);
check_nodeidx_pair(deltas, g);
@@ -558,13 +634,14 @@ NodeID GraphBuilder::add_l2_normalize_node(Graph &g, NodeParams params, NodeIdxP
return create_simple_single_input_output_node<L2NormalizeLayerNode>(g, params, input, axis, epsilon);
}
-NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
+NodeID
+GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
{
return create_simple_single_input_output_node<NormalizationLayerNode>(g, params, input, norm_info);
}
-NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
- ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
+NodeID GraphBuilder::add_normalize_planar_yuv_node(
+ Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
{
check_nodeidx_pair(input, g);
@@ -589,12 +666,14 @@ NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params,
return norm_planar_yuv_nid;
}
-NodeID GraphBuilder::add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value)
+NodeID GraphBuilder::add_pad_node(
+ Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value)
{
return create_simple_single_input_output_node<PadLayerNode>(g, params, input, paddings, pad_value);
}
-NodeID GraphBuilder::add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout)
+NodeID GraphBuilder::add_permute_node(
+ Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout)
{
return create_simple_single_input_output_node<PermuteLayerNode>(g, params, input, perm, layout);
}
@@ -618,12 +697,18 @@ NodeID GraphBuilder::add_pooling_node(Graph &g, NodeParams params, NodeIdxPair i
return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
}
-NodeID GraphBuilder::add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info, const std::function<ITensor *(ITensor *)> transform)
+NodeID GraphBuilder::add_print_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ std::ostream &stream,
+ const IOFormatInfo &format_info,
+ const std::function<ITensor *(ITensor *)> transform)
{
return create_simple_single_input_output_node<PrintLayerNode>(g, params, input, stream, format_info, transform);
}
-NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info)
+NodeID GraphBuilder::add_priorbox_node(
+ Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info)
{
check_nodeidx_pair(input0, g);
check_nodeidx_pair(input1, g);
@@ -638,12 +723,16 @@ NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair
return prior_nid;
}
-NodeID GraphBuilder::add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info)
+NodeID GraphBuilder::add_quantization_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ const QuantizationInfo &out_quant_info)
{
return create_simple_single_input_output_node<QuantizationLayerNode>(g, params, input, out_quant_info);
}
-NodeID GraphBuilder::add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims)
+NodeID GraphBuilder::add_reduction_operation_node(
+ Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims)
{
return create_simple_single_input_output_node<ReductionLayerNode>(g, params, input, op, axis, keep_dims);
}
@@ -658,13 +747,14 @@ NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair i
return create_simple_single_input_output_node<ReshapeLayerNode>(g, params, input, shape);
}
-NodeID GraphBuilder::add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy,
- float width_scale, float height_scale)
+NodeID GraphBuilder::add_resize_node(
+ Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale)
{
return create_simple_single_input_output_node<ResizeLayerNode>(g, params, input, policy, width_scale, height_scale);
}
-NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
+NodeID GraphBuilder::add_roi_align_node(
+ Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
{
check_nodeidx_pair(input, g);
check_nodeidx_pair(rois, g);
@@ -678,7 +768,11 @@ NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair
return nid;
}
-NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
+NodeID GraphBuilder::add_scale_layer(Graph &g,
+ const NodeParams &params,
+ NodeIdxPair input,
+ ITensorAccessorUPtr mul_accessor,
+ ITensorAccessorUPtr add_accessor)
{
check_nodeidx_pair(input, g);
@@ -688,22 +782,23 @@ NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdx
// Create mul node
TensorDescriptor mul_desc = input_tensor_desc;
- const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)];
+ const size_t C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)];
mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), 1);
mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), 1);
mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), C);
NodeID mul_const_nid = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor));
- NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 };
+ NodeIdxPair mul_const_nidxp = {mul_const_nid, 0};
// Create add node
TensorDescriptor add_desc = mul_desc;
NodeID add_const_nid = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor));
- NodeIdxPair add_const_nidxp = { add_const_nid, 0 };
+ NodeIdxPair add_const_nidxp = {add_const_nid, 0};
// Create node and connect
- NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul);
- NodeIdxPair mulnode_nidxp = { mul_node, 0 };
- NodeID add_node = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add);
+ NodeID mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul);
+ NodeIdxPair mulnode_nidxp = {mul_node, 0};
+ NodeID add_node =
+ GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add);
return add_node;
}
@@ -713,17 +808,25 @@ NodeID GraphBuilder::add_softmax_node(Graph &g, NodeParams params, NodeIdxPair i
return create_simple_single_input_output_node<SoftmaxLayerNode>(g, params, input, beta);
}
-NodeID GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends)
+NodeID
+GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends)
{
return create_simple_single_input_output_node<SliceLayerNode>(g, params, input, starts, ends);
}
-NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
+NodeID
+GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
{
return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
}
-NodeID GraphBuilder::add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info)
+NodeID GraphBuilder::add_strided_slice_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Coordinates &starts,
+ Coordinates &ends,
+ BiStrides &strides,
+ StridedSliceLayerInfo info)
{
return create_simple_single_input_output_node<StridedSliceLayerNode>(g, params, input, starts, ends, strides, info);
}
@@ -770,7 +873,8 @@ NodeID GraphBuilder::add_yolo_node(Graph &g, NodeParams params, NodeIdxPair inpu
g.add_connection(input.node_id, input.index, cls, 0);
g.add_connection(cls, 0, cls_act, 0);
- NodeID concat = g.add_node<ConcatenateLayerNode>(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL));
+ NodeID concat =
+ g.add_node<ConcatenateLayerNode>(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL));
set_node_params(g, concat, params);
g.add_connection(act_box, 0, concat, 0);
g.add_connection(imm, 0, concat, 1);
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index 7b74c2fe0e..10850aa259 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -24,15 +24,14 @@
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph.h"
-#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Utils.h"
namespace arm_compute
{
namespace graph
{
-GraphContext::GraphContext()
- : _config(), _memory_managers(), _weights_managers()
+GraphContext::GraphContext() : _config(), _memory_managers(), _weights_managers()
{
}
@@ -56,7 +55,7 @@ void GraphContext::set_config(const GraphConfig &config)
bool GraphContext::insert_memory_management_ctx(MemoryManagerContext &&memory_ctx)
{
Target target = memory_ctx.target;
- if(target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers))
+ if (target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers))
{
return false;
}
@@ -79,7 +78,7 @@ bool GraphContext::insert_weights_management_ctx(WeightsManagerContext &&weights
{
Target target = weights_managers.target;
- if(_weights_managers.find(target) != std::end(_weights_managers))
+ if (_weights_managers.find(target) != std::end(_weights_managers))
{
return false;
}
@@ -102,17 +101,17 @@ std::map<Target, WeightsManagerContext> &GraphContext::weights_managers()
void GraphContext::finalize()
{
const size_t num_pools = 1;
- for(auto &mm_obj : _memory_managers)
+ for (auto &mm_obj : _memory_managers)
{
ARM_COMPUTE_ERROR_ON(!mm_obj.second.allocator);
// Finalize intra layer memory manager
- if(mm_obj.second.intra_mm != nullptr)
+ if (mm_obj.second.intra_mm != nullptr)
{
mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools);
}
// Finalize cross layer memory manager
- if(mm_obj.second.cross_mm != nullptr)
+ if (mm_obj.second.cross_mm != nullptr)
{
mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools);
}
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index 45b608c70a..58ae60d4cc 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -23,15 +23,15 @@
*/
#include "arm_compute/graph/GraphManager.h"
+#include "arm_compute/graph/algorithms/TopologicalSort.h"
+#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+#include "arm_compute/graph/detail/ExecutionHelpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/PassManager.h"
#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/algorithms/TopologicalSort.h"
-#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
-#include "arm_compute/graph/detail/ExecutionHelpers.h"
#include "src/common/utils/Log.h"
@@ -39,8 +39,7 @@ namespace arm_compute
{
namespace graph
{
-GraphManager::GraphManager()
- : _workloads()
+GraphManager::GraphManager() : _workloads()
{
}
@@ -49,7 +48,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph configuration!");
// Check if graph has been registered
- if(_workloads.find(graph.id()) != std::end(_workloads))
+ if (_workloads.find(graph.id()) != std::end(_workloads))
{
ARM_COMPUTE_ERROR("Graph is already registered!");
}
@@ -62,7 +61,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
// In case CLVK is selected, use the CL backend and
// update config
- if(target == Target::CLVK)
+ if (target == Target::CLVK)
{
forced_target = Target::CL;
GraphConfig config = ctx.config();
@@ -71,7 +70,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
ctx.set_config(config);
}
- if(!is_target_supported(target))
+ if (!is_target_supported(target))
{
forced_target = get_default_target();
ARM_COMPUTE_LOG_GRAPH_INFO("Switching target from " << target << " to " << forced_target << std::endl);
@@ -105,7 +104,7 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
detail::prepare_all_tasks(workload);
// Setup tensor memory (Allocate all tensors or setup transition manager)
- if(ctx.config().use_transition_memory_manager)
+ if (ctx.config().use_transition_memory_manager)
{
detail::configure_transition_manager(graph, ctx, workload);
}
@@ -130,10 +129,10 @@ void GraphManager::execute_graph(Graph &graph)
auto it = _workloads.find(graph.id());
ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
- while(true)
+ while (true)
{
// Call input accessors
- if(!detail::call_all_input_node_accessors(it->second))
+ if (!detail::call_all_input_node_accessors(it->second))
{
return;
}
@@ -142,7 +141,7 @@ void GraphManager::execute_graph(Graph &graph)
detail::call_all_tasks(it->second);
// Call output accessors
- if(!detail::call_all_output_node_accessors(it->second))
+ if (!detail::call_all_output_node_accessors(it->second))
{
return;
}
@@ -157,4 +156,4 @@ void GraphManager::invalidate_graph(Graph &graph)
_workloads.erase(it);
}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 70fe44e134..83c3ef7e37 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -75,17 +75,17 @@ void INode::set_assigned_target(Target target)
void INode::set_output_tensor(TensorID tid, size_t idx)
{
- if(tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr))
+ if (tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr))
{
ARM_COMPUTE_ERROR_ON(_graph == nullptr);
Tensor *updated_tensor = _graph->tensor(tid);
_outputs[idx] = tid;
// Set tensor to all output edges of the node
- for(auto &output_edge_id : _output_edges)
+ for (auto &output_edge_id : _output_edges)
{
auto output_edge = _graph->edge(output_edge_id);
- if(output_edge != nullptr)
+ if (output_edge != nullptr)
{
// Unbind edge from current tensor
auto current_output_tensor = output_edge->tensor();
diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp
index 5369f6f539..90b2e3327f 100644
--- a/src/graph/INodeVisitor.cpp
+++ b/src/graph/INodeVisitor.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/graph/INodeVisitor.h"
+
#include "arm_compute/graph/nodes/Nodes.h"
namespace arm_compute
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
index f7e214c1b4..9a889e1da3 100644
--- a/src/graph/PassManager.cpp
+++ b/src/graph/PassManager.cpp
@@ -29,8 +29,7 @@ namespace arm_compute
{
namespace graph
{
-PassManager::PassManager()
- : _passes()
+PassManager::PassManager() : _passes()
{
}
@@ -46,7 +45,7 @@ IGraphMutator *PassManager::pass(size_t index)
void PassManager::append(std::unique_ptr<IGraphMutator> pass, bool conditional)
{
- if(pass && conditional)
+ if (pass && conditional)
{
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl);
_passes.push_back(std::move(pass));
@@ -60,9 +59,9 @@ void PassManager::clear()
void PassManager::run_all(Graph &g)
{
- for(auto &pass : _passes)
+ for (auto &pass : _passes)
{
- if(pass)
+ if (pass)
{
ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
pass->mutate(g);
@@ -72,9 +71,9 @@ void PassManager::run_all(Graph &g)
void PassManager::run_type(Graph &g, IGraphMutator::MutationType type)
{
- for(auto &pass : _passes)
+ for (auto &pass : _passes)
{
- if(pass && (pass->type() == type))
+ if (pass && (pass->type() == type))
{
ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
pass->mutate(g);
@@ -84,17 +83,17 @@ void PassManager::run_type(Graph &g, IGraphMutator::MutationType type)
void PassManager::run_index(Graph &g, size_t index)
{
- if(index >= _passes.size())
+ if (index >= _passes.size())
{
return;
}
auto &pass = _passes.at(index);
- if(pass != nullptr)
+ if (pass != nullptr)
{
ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
pass->mutate(g);
}
}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 3d4723430f..72679c4ea4 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -75,20 +75,20 @@ std::unique_ptr<ITensorAccessor> Tensor::extract_accessor()
bool Tensor::call_accessor()
{
// Early exit guard
- if(!_accessor || !_handle)
+ if (!_accessor || !_handle)
{
return false;
}
const bool access_data = _accessor->access_tensor_data();
- if(access_data)
+ if (access_data)
{
// Map tensor
_handle->map(true);
// Return in case of null backend buffer
- if(_handle->tensor().buffer() == nullptr)
+ if (_handle->tensor().buffer() == nullptr)
{
return false;
}
@@ -97,7 +97,7 @@ bool Tensor::call_accessor()
// Call accessor
bool retval = _accessor->access_tensor(_handle->tensor());
- if(access_data)
+ if (access_data)
{
// Unmap tensor
_handle->unmap();
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index 3c51289dba..e1248fbb6b 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -31,10 +31,9 @@ namespace arm_compute
{
arm_compute::DataLayout data_layout_from_name(const std::string &name)
{
- static const std::map<std::string, arm_compute::DataLayout> data_layouts =
- {
- { "nhwc", DataLayout::NHWC },
- { "nchw", DataLayout::NCHW },
+ static const std::map<std::string, arm_compute::DataLayout> data_layouts = {
+ {"nhwc", DataLayout::NHWC},
+ {"nchw", DataLayout::NCHW},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -45,7 +44,7 @@ arm_compute::DataLayout data_layout_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
@@ -55,11 +54,10 @@ namespace graph
{
Target target_from_name(const std::string &name)
{
- static const std::map<std::string, Target> targets =
- {
- { "neon", Target::NEON },
- { "cl", Target::CL },
- { "clvk", Target::CLVK },
+ static const std::map<std::string, Target> targets = {
+ {"neon", Target::NEON},
+ {"cl", Target::CL},
+ {"clvk", Target::CLVK},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -70,7 +68,7 @@ Target target_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
@@ -79,12 +77,11 @@ Target target_from_name(const std::string &name)
ConvolutionMethod Convolution_method_from_name(const std::string &name)
{
- static const std::map<std::string, ConvolutionMethod> methods =
- {
- { "default", ConvolutionMethod::Default },
- { "direct", ConvolutionMethod::Direct },
- { "gemm", ConvolutionMethod::GEMM },
- { "winograd", ConvolutionMethod::Winograd },
+ static const std::map<std::string, ConvolutionMethod> methods = {
+ {"default", ConvolutionMethod::Default},
+ {"direct", ConvolutionMethod::Direct},
+ {"gemm", ConvolutionMethod::GEMM},
+ {"winograd", ConvolutionMethod::Winograd},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -95,7 +92,7 @@ ConvolutionMethod Convolution_method_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
@@ -104,10 +101,9 @@ ConvolutionMethod Convolution_method_from_name(const std::string &name)
DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::string &name)
{
- static const std::map<std::string, DepthwiseConvolutionMethod> methods =
- {
- { "default", DepthwiseConvolutionMethod::Default },
- { "optimized3x3", DepthwiseConvolutionMethod::Optimized3x3 },
+ static const std::map<std::string, DepthwiseConvolutionMethod> methods = {
+ {"default", DepthwiseConvolutionMethod::Default},
+ {"optimized3x3", DepthwiseConvolutionMethod::Optimized3x3},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -118,7 +114,7 @@ DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::str
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index dcab177a3b..452d8ec7b2 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/mutators/GraphMutators.h"
namespace arm_compute
@@ -33,16 +33,17 @@ namespace graph
{
bool is_target_supported(Target target)
{
- return backends::BackendRegistry::get().contains(target) && backends::BackendRegistry::get().find_backend(target)->is_backend_supported();
+ return backends::BackendRegistry::get().contains(target) &&
+ backends::BackendRegistry::get().find_backend(target)->is_backend_supported();
}
Target get_default_target()
{
- if(is_target_supported(Target::NEON))
+ if (is_target_supported(Target::NEON))
{
return Target::NEON;
}
- if(is_target_supported(Target::CL))
+ if (is_target_supported(Target::CL))
{
return Target::CL;
}
@@ -52,18 +53,18 @@ Target get_default_target()
void force_target_to_graph(Graph &g, Target target)
{
auto &nodes = g.nodes();
- for(auto &node : nodes)
+ for (auto &node : nodes)
{
- if(node)
+ if (node)
{
node->set_assigned_target(target);
}
}
auto &tensors = g.tensors();
- for(auto &tensor : tensors)
+ for (auto &tensor : tensors)
{
- if(tensor)
+ if (tensor)
{
tensor->desc().target = target;
}
@@ -76,9 +77,9 @@ PassManager create_default_pass_manager(Target target, const GraphConfig &cfg)
PassManager pm;
// Passes that mutate graph IR
- if(cfg.use_synthetic_type)
+ if (cfg.use_synthetic_type)
{
- switch(cfg.synthetic_type)
+ switch (cfg.synthetic_type)
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -107,9 +108,9 @@ PassManager create_default_pass_manager(Target target, const GraphConfig &cfg)
void release_default_graph_context(GraphContext &ctx)
{
- for(const auto &backend : backends::BackendRegistry::get().backends())
+ for (const auto &backend : backends::BackendRegistry::get().backends())
{
- if(backend.second->is_backend_supported())
+ if (backend.second->is_backend_supported())
{
backend.second->release_backend_context(ctx);
}
@@ -118,9 +119,9 @@ void release_default_graph_context(GraphContext &ctx)
void sync_backends()
{
- for(const auto &backend : backends::BackendRegistry::get().backends())
+ for (const auto &backend : backends::BackendRegistry::get().backends())
{
- if(backend.second->backend_allocator())
+ if (backend.second->backend_allocator())
{
backend.second->sync();
}
@@ -129,10 +130,10 @@ void sync_backends()
void setup_requested_backend_context(GraphContext &ctx, Target target)
{
- if(backends::BackendRegistry::get().contains(target))
+ if (backends::BackendRegistry::get().contains(target))
{
const auto &backend = backends::BackendRegistry::get().find_backend(target);
- if(backend->is_backend_supported())
+ if (backend->is_backend_supported())
{
backend->setup_backend_context(ctx);
}
@@ -141,20 +142,22 @@ void setup_requested_backend_context(GraphContext &ctx, Target target)
size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
{
- ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+ ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN,
+ "Cannot retrieve the dimension index for an unknown layout!");
return descriptor.shape[get_dimension_idx(descriptor.layout, data_layout_dimension)];
}
size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
{
- ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+ ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+ "Cannot retrieve the dimension index for an unknown layout!");
/* Return the index based on the data layout
* [N C H W]
* [3 2 1 0]
* [N H W C]
*/
- switch(data_layout_dimension)
+ switch (data_layout_dimension)
{
case DataLayoutDimension::CHANNEL:
return (data_layout == DataLayout::NCHW) ? 2 : 0;
@@ -181,13 +184,13 @@ std::vector<NodeIdxPair> get_driving_nodes(const INode &node)
const Graph *g = node.graph();
ARM_COMPUTE_ERROR_ON(g == nullptr);
- for(auto &output_edge_id : node.output_edges())
+ for (auto &output_edge_id : node.output_edges())
{
auto output_edge = g->edge(output_edge_id);
- if(output_edge != nullptr)
+ if (output_edge != nullptr)
{
ARM_COMPUTE_ERROR_ON(output_edge->consumer() == nullptr);
- driving_nodes.push_back({ output_edge->consumer_id(), output_edge->consumer_idx() });
+ driving_nodes.push_back({output_edge->consumer_id(), output_edge->consumer_idx()});
}
}
@@ -201,13 +204,13 @@ std::vector<NodeIdxPair> get_driver_nodes(const INode &node)
const Graph *g = node.graph();
ARM_COMPUTE_ERROR_ON(g == nullptr);
- for(auto &input_edge_id : node.input_edges())
+ for (auto &input_edge_id : node.input_edges())
{
auto input_edge = g->edge(input_edge_id);
- if(input_edge != nullptr)
+ if (input_edge != nullptr)
{
ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr);
- driver_nodes.push_back({ input_edge->producer_id(), input_edge->producer_idx() });
+ driver_nodes.push_back({input_edge->producer_id(), input_edge->producer_idx()});
}
}
@@ -216,7 +219,7 @@ std::vector<NodeIdxPair> get_driver_nodes(const INode &node)
void configure_tensor(Tensor *tensor)
{
- if(tensor != nullptr && tensor->handle() == nullptr)
+ if (tensor != nullptr && tensor->handle() == nullptr)
{
Target target = tensor->desc().target;
backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(target);
diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp
index b9d57295b0..9dddad7cbd 100644
--- a/src/graph/Workload.cpp
+++ b/src/graph/Workload.cpp
@@ -40,12 +40,12 @@ void ExecutionTask::operator()()
void execute_task(ExecutionTask &task)
{
- if(task.task)
+ if (task.task)
{
task.task->run();
}
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
- else if(task.node->type() == NodeType::PrintLayer)
+ else if (task.node->type() == NodeType::PrintLayer)
{
auto print_node = utils::cast::polymorphic_downcast<PrintLayerNode *>(task.node);
auto input_handle = print_node->input(0)->handle();
@@ -61,14 +61,13 @@ void execute_task(ExecutionTask &task)
void ExecutionTask::prepare()
{
- if(task)
+ if (task)
{
task->prepare();
}
}
-TaskExecutor::TaskExecutor()
- : execute_function(execute_task)
+TaskExecutor::TaskExecutor() : execute_function(execute_task)
{
}
@@ -78,4 +77,4 @@ TaskExecutor &TaskExecutor::get()
return executor;
}
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp
index 3a69352471..08e14e1657 100644
--- a/src/graph/algorithms/TopologicalSort.cpp
+++ b/src/graph/algorithms/TopologicalSort.cpp
@@ -50,14 +50,14 @@ inline bool all_inputs_are_visited(const INode *node, const std::vector<bool> &v
ARM_COMPUTE_ERROR_ON(graph == nullptr);
bool are_all_visited = true;
- for(const auto &input_edge_id : node->input_edges())
+ for (const auto &input_edge_id : node->input_edges())
{
- if(input_edge_id != EmptyNodeID)
+ if (input_edge_id != EmptyNodeID)
{
const Edge *input_edge = graph->edge(input_edge_id);
ARM_COMPUTE_ERROR_ON(input_edge == nullptr);
ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr);
- if(!visited[input_edge->producer_id()])
+ if (!visited[input_edge->producer_id()])
{
are_all_visited = false;
break;
@@ -80,9 +80,9 @@ std::vector<NodeID> bfs(Graph &g)
std::list<NodeID> queue;
// Push inputs and mark as visited
- for(auto &input : g.nodes(NodeType::Input))
+ for (auto &input : g.nodes(NodeType::Input))
{
- if(input != EmptyNodeID)
+ if (input != EmptyNodeID)
{
visited[input] = true;
queue.push_back(input);
@@ -90,9 +90,9 @@ std::vector<NodeID> bfs(Graph &g)
}
// Push const nodes and mark as visited
- for(auto &const_node : g.nodes(NodeType::Const))
+ for (auto &const_node : g.nodes(NodeType::Const))
{
- if(const_node != EmptyNodeID)
+ if (const_node != EmptyNodeID)
{
visited[const_node] = true;
queue.push_back(const_node);
@@ -100,7 +100,7 @@ std::vector<NodeID> bfs(Graph &g)
}
// Iterate over vector and edges
- while(!queue.empty())
+ while (!queue.empty())
{
// Dequeue a node from queue and process
NodeID n = queue.front();
@@ -109,11 +109,11 @@ std::vector<NodeID> bfs(Graph &g)
const INode *node = g.node(n);
ARM_COMPUTE_ERROR_ON(node == nullptr);
- for(const auto &eid : node->output_edges())
+ for (const auto &eid : node->output_edges())
{
const Edge *e = g.edge(eid);
ARM_COMPUTE_ERROR_ON(e == nullptr);
- if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
+ if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
{
visited[e->consumer_id()] = true;
queue.push_back(e->consumer_id());
@@ -135,9 +135,9 @@ std::vector<NodeID> dfs(Graph &g)
std::stack<NodeID> stack;
// Push inputs and mark as visited
- for(auto &input : g.nodes(NodeType::Input))
+ for (auto &input : g.nodes(NodeType::Input))
{
- if(input != EmptyNodeID)
+ if (input != EmptyNodeID)
{
visited[input] = true;
stack.push(input);
@@ -145,9 +145,9 @@ std::vector<NodeID> dfs(Graph &g)
}
// Push const nodes and mark as visited
- for(auto &const_node : g.nodes(NodeType::Const))
+ for (auto &const_node : g.nodes(NodeType::Const))
{
- if(const_node != EmptyNodeID)
+ if (const_node != EmptyNodeID)
{
visited[const_node] = true;
stack.push(const_node);
@@ -155,7 +155,7 @@ std::vector<NodeID> dfs(Graph &g)
}
// Iterate over vector and edges
- while(!stack.empty())
+ while (!stack.empty())
{
// Pop a node from stack and process
NodeID n = stack.top();
@@ -163,7 +163,7 @@ std::vector<NodeID> dfs(Graph &g)
stack.pop();
// Mark node as visited
- if(!visited[n])
+ if (!visited[n])
{
visited[n] = true;
}
@@ -171,11 +171,11 @@ std::vector<NodeID> dfs(Graph &g)
const INode *node = g.node(n);
ARM_COMPUTE_ERROR_ON(node == nullptr);
// Reverse iterate to push branches from right to left and pop on the opposite order
- for(const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges()))
+ for (const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges()))
{
const Edge *e = g.edge(eid);
ARM_COMPUTE_ERROR_ON(e == nullptr);
- if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
+ if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
{
stack.push(e->consumer_id());
}
diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp
index 46b4f99e23..bb6af79f8b 100644
--- a/src/graph/backends/BackendRegistry.cpp
+++ b/src/graph/backends/BackendRegistry.cpp
@@ -31,8 +31,7 @@ namespace graph
{
namespace backends
{
-BackendRegistry::BackendRegistry()
- : _registered_backends()
+BackendRegistry::BackendRegistry() : _registered_backends()
{
}
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index 01e5ab1730..e27a4109d1 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -23,18 +23,17 @@
*/
#include "arm_compute/graph/backends/CL/CLDeviceBackend.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/graph/backends/BackendRegistrar.h"
#include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
#include "arm_compute/graph/backends/CL/CLNodeValidator.h"
#include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
#include "arm_compute/graph/backends/CL/CLTensorHandle.h"
-
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -64,7 +63,12 @@ bool file_exists(const std::string &filename)
static detail::BackendRegistrar<CLDeviceBackend> CLDeviceBackend_registrar(Target::CL);
CLDeviceBackend::CLDeviceBackend()
- : _context_count(0), _tuner(), _gemm_heuristics(), _allocator(nullptr), _tuner_file(), _backend_type(CLBackendType::Native)
+ : _context_count(0),
+ _tuner(),
+ _gemm_heuristics(),
+ _allocator(nullptr),
+ _tuner_file(),
+ _backend_type(CLBackendType::Native)
{
}
@@ -95,7 +99,7 @@ void CLDeviceBackend::release_backend_context(GraphContext &ctx)
{
ARM_COMPUTE_UNUSED(ctx);
_context_count--;
- if(_context_count == 0) // No more context using the backend: free resources
+ if (_context_count == 0) // No more context using the backend: free resources
{
_allocator = nullptr;
}
@@ -105,7 +109,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
{
// Force backend initialization
_context_count++;
- if(_context_count == 1)
+ if (_context_count == 1)
{
_backend_type = ctx.config().backend_type;
initialize_backend();
@@ -115,7 +119,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
_tuner_file = ctx.config().tuner_file;
// Load tuner data if available
- if(file_exists(_tuner_file))
+ if (file_exists(_tuner_file))
{
_tuner.load_from_file(_tuner_file);
}
@@ -128,7 +132,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
CLScheduler::get().gemm_heuristics()->reload_from_file(ctx.config().mlgo_file);
// Setup a management backend
- if(ctx.memory_management_ctx(Target::CL) == nullptr)
+ if (ctx.memory_management_ctx(Target::CL) == nullptr)
{
MemoryManagerContext mm_ctx;
mm_ctx.target = Target::CL;
@@ -141,7 +145,7 @@ void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
}
// Create function level weights manager
- if(ctx.weights_management_ctx(Target::CL) == nullptr)
+ if (ctx.weights_management_ctx(Target::CL) == nullptr)
{
WeightsManagerContext wm_ctx;
wm_ctx.target = Target::CL;
@@ -174,9 +178,10 @@ std::unique_ptr<ITensorHandle> CLDeviceBackend::create_tensor(const Tensor &tens
return std::make_unique<CLTensorHandle>(info);
}
-std::unique_ptr<ITensorHandle> CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+std::unique_ptr<ITensorHandle>
+CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
{
- if(parent == nullptr)
+ if (parent == nullptr)
{
return nullptr;
}
@@ -203,7 +208,7 @@ arm_compute::Status CLDeviceBackend::validate_node(INode &node)
std::shared_ptr<arm_compute::IMemoryManager> CLDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
{
- if(affinity == MemoryManagerAffinity::Offset)
+ if (affinity == MemoryManagerAffinity::Offset)
{
ARM_COMPUTE_LOG_GRAPH_WARNING("CL Backend does not support offset affinity memory management!");
return nullptr;
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 882810474e..d4e1aa880f 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -22,12 +22,12 @@
* SOFTWARE.
*/
#include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
-
+#include "arm_compute/graph/backends/FunctionHelpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/backends/FunctionHelpers.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CPP/CPPFunctions.h"
+
#include "src/core/CL/CLKernels.h"
#include "support/Cast.h"
@@ -89,20 +89,19 @@ class CPPWrapperFunction : public IFunction
{
public:
/* Default constructor */
- CPPWrapperFunction()
- : _tensors(), _func(nullptr)
+ CPPWrapperFunction() : _tensors(), _func(nullptr)
{
}
void run() override
{
- for(auto &tensor : _tensors)
+ for (auto &tensor : _tensors)
{
tensor->map(CLScheduler::get().queue());
}
_func->run();
- for(auto &tensor : _tensors)
+ for (auto &tensor : _tensors)
{
tensor->unmap(CLScheduler::get().queue());
}
@@ -127,7 +126,8 @@ namespace detail
{
// Specialized functions
template <>
-std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
+std::unique_ptr<IFunction>
+create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
{
validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
@@ -149,16 +149,12 @@ std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << CLTargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Input2 shape: " << input2->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " DetectionOutputLayer info: " << detect_info
- << std::endl);
+ << " DetectionOutputLayer info: " << detect_info << std::endl);
auto wrap_function = std::make_unique<CPPWrapperFunction>();
@@ -171,7 +167,8 @@ std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer
return std::move(wrap_function);
}
template <>
-std::unique_ptr<IFunction> create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(DetectionPostProcessLayerNode &node)
+std::unique_ptr<IFunction>
+create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(DetectionPostProcessLayerNode &node)
{
validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 4 /* expected outputs */);
@@ -199,19 +196,15 @@ std::unique_ptr<IFunction> create_detection_post_process_layer<CPPDetectionPostP
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << CLTargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Input2 shape: " << input2->info()->tensor_shape()
<< " Output0 shape: " << output0->info()->tensor_shape()
<< " Output1 shape: " << output1->info()->tensor_shape()
<< " Output2 shape: " << output2->info()->tensor_shape()
<< " Output3 shape: " << output3->info()->tensor_shape()
- << " DetectionPostProcessLayer info: " << detect_info
- << std::endl);
+ << " DetectionPostProcessLayer info: " << detect_info << std::endl);
auto wrap_function = std::make_unique<CPPWrapperFunction>();
@@ -230,92 +223,128 @@ std::unique_ptr<IFunction> create_detection_post_process_layer<CPPDetectionPostP
std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
{
- if(node == nullptr)
+ if (node == nullptr)
{
return nullptr;
}
NodeType type = node->type();
- switch(type)
+ switch (type)
{
case NodeType::ActivationLayer:
- return detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
+ return detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(
+ *polymorphic_downcast<ActivationLayerNode *>(node));
case NodeType::ArgMinMaxLayer:
- return detail::create_arg_min_max_layer<CLArgMinMaxLayer, CLTargetInfo>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+ return detail::create_arg_min_max_layer<CLArgMinMaxLayer, CLTargetInfo>(
+ *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
case NodeType::BatchNormalizationLayer:
- return detail::create_batch_normalization_layer<CLBatchNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ return detail::create_batch_normalization_layer<CLBatchNormalizationLayer, CLTargetInfo>(
+ *polymorphic_downcast<BatchNormalizationLayerNode *>(node));
case NodeType::BoundingBoxTransformLayer:
- return detail::create_bounding_box_transform_layer<CLBoundingBoxTransform, CLTargetInfo>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
+ return detail::create_bounding_box_transform_layer<CLBoundingBoxTransform, CLTargetInfo>(
+ *polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
case NodeType::ChannelShuffleLayer:
- return detail::create_channel_shuffle_layer<CLChannelShuffleLayer, CLTargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+ return detail::create_channel_shuffle_layer<CLChannelShuffleLayer, CLTargetInfo>(
+ *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
- return detail::create_convolution_layer<CLConvolutionLayerFunctions, CLTargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+ return detail::create_convolution_layer<CLConvolutionLayerFunctions, CLTargetInfo>(
+ *polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
case NodeType::DeconvolutionLayer:
- return detail::create_deconvolution_layer<CLDeconvolutionLayer, CLTargetInfo>(*polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
+ return detail::create_deconvolution_layer<CLDeconvolutionLayer, CLTargetInfo>(
+ *polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
case NodeType::ConcatenateLayer:
- return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
+ return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(
+ *polymorphic_downcast<ConcatenateLayerNode *>(node));
case NodeType::DepthToSpaceLayer:
- return detail::create_depth_to_space_layer<CLDepthToSpaceLayer, CLTargetInfo>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+ return detail::create_depth_to_space_layer<CLDepthToSpaceLayer, CLTargetInfo>(
+ *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
- return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayer, CLTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayer, CLTargetInfo>(
+ *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
case NodeType::DequantizationLayer:
- return detail::create_dequantization_layer<CLDequantizationLayer, CLTargetInfo>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+ return detail::create_dequantization_layer<CLDequantizationLayer, CLTargetInfo>(
+ *polymorphic_downcast<DequantizationLayerNode *>(node));
case NodeType::DetectionOutputLayer:
- return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+ return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(
+ *polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::DetectionPostProcessLayer:
- return detail::create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+ return detail::create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(
+ *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
case NodeType::EltwiseLayer:
- return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(
+ *polymorphic_downcast<EltwiseLayerNode *>(node));
case NodeType::UnaryEltwiseLayer:
- return detail::create_unary_eltwise_layer<CLUnaryEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+ return detail::create_unary_eltwise_layer<CLUnaryEltwiseFunctions, CLTargetInfo>(
+ *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
case NodeType::FlattenLayer:
- return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
+ return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(
+ *polymorphic_downcast<FlattenLayerNode *>(node));
case NodeType::FullyConnectedLayer:
- return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(
+ *polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
case NodeType::FusedConvolutionBatchNormalizationLayer:
- return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
+ return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(
+ *polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
- return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
+ return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes,
+ CLTargetInfo>(
+ *polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
case NodeType::GenerateProposalsLayer:
- return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
+ return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(
+ *polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
case NodeType::L2NormalizeLayer:
- return detail::create_l2_normalize_layer<CLL2NormalizeLayer, CLTargetInfo>(*polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
+ return detail::create_l2_normalize_layer<CLL2NormalizeLayer, CLTargetInfo>(
+ *polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
case NodeType::NormalizationLayer:
- return detail::create_normalization_layer<CLNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+ return detail::create_normalization_layer<CLNormalizationLayer, CLTargetInfo>(
+ *polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
case NodeType::NormalizePlanarYUVLayer:
- return detail::create_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer, CLTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+ return detail::create_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer, CLTargetInfo>(
+ *polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
case NodeType::PadLayer:
return detail::create_pad_layer<CLPadLayer, CLTargetInfo>(*polymorphic_downcast<PadLayerNode *>(node));
case NodeType::PermuteLayer:
- return detail::create_permute_layer<CLPermute, CLTargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
+ return detail::create_permute_layer<CLPermute, CLTargetInfo>(
+ *polymorphic_downcast<PermuteLayerNode *>(node));
case NodeType::PoolingLayer:
- return detail::create_pooling_layer<CLPoolingLayer, CLTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+ return detail::create_pooling_layer<CLPoolingLayer, CLTargetInfo>(
+ *polymorphic_downcast<PoolingLayerNode *>(node));
case NodeType::PReluLayer:
- return detail::create_prelu_layer<CLPReluLayer, CLTargetInfo>(*polymorphic_downcast<PReluLayerNode *>(node));
+ return detail::create_prelu_layer<CLPReluLayer, CLTargetInfo>(
+ *polymorphic_downcast<PReluLayerNode *>(node));
case NodeType::PrintLayer:
return detail::create_print_layer<CLTargetInfo>(*polymorphic_downcast<PrintLayerNode *>(node));
case NodeType::PriorBoxLayer:
- return detail::create_priorbox_layer<CLPriorBoxLayer, CLTargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+ return detail::create_priorbox_layer<CLPriorBoxLayer, CLTargetInfo>(
+ *polymorphic_downcast<PriorBoxLayerNode *>(node));
case NodeType::QuantizationLayer:
- return detail::create_quantization_layer<CLQuantizationLayer, CLTargetInfo>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+ return detail::create_quantization_layer<CLQuantizationLayer, CLTargetInfo>(
+ *polymorphic_downcast<QuantizationLayerNode *>(node));
case NodeType::ReductionOperationLayer:
- return detail::create_reduction_operation_layer<CLReductionOperation, CLTargetInfo>(*polymorphic_downcast<ReductionLayerNode *>(node), ctx);
+ return detail::create_reduction_operation_layer<CLReductionOperation, CLTargetInfo>(
+ *polymorphic_downcast<ReductionLayerNode *>(node), ctx);
case NodeType::ReorgLayer:
- return detail::create_reorg_layer<CLReorgLayer, CLTargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
+ return detail::create_reorg_layer<CLReorgLayer, CLTargetInfo>(
+ *polymorphic_downcast<ReorgLayerNode *>(node));
case NodeType::ReshapeLayer:
- return detail::create_reshape_layer<CLReshapeLayer, CLTargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
+ return detail::create_reshape_layer<CLReshapeLayer, CLTargetInfo>(
+ *polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ResizeLayer:
return detail::create_resize_layer<CLScale, CLTargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
case NodeType::ROIAlignLayer:
- return detail::create_roi_align_layer<CLROIAlignLayer, CLTargetInfo>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
+ return detail::create_roi_align_layer<CLROIAlignLayer, CLTargetInfo>(
+ *polymorphic_downcast<ROIAlignLayerNode *>(node));
case NodeType::SliceLayer:
return detail::create_slice_layer<CLSlice, CLTargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
case NodeType::SoftmaxLayer:
- return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(
+ *polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
case NodeType::StackLayer:
- return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
+ return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(
+ *polymorphic_downcast<StackLayerNode *>(node));
case NodeType::StridedSliceLayer:
- return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+ return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(
+ *polymorphic_downcast<StridedSliceLayerNode *>(node));
default:
return nullptr;
}
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index 8fd8c14f63..510eda7935 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -25,7 +25,6 @@
#include "arm_compute/graph/backends/ValidateHelpers.h"
#include "arm_compute/graph/nodes/Nodes.h"
-
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CPP/CPPFunctions.h"
@@ -57,41 +56,51 @@ struct CLUnaryEltwiseLayerFunctions
Status CLNodeValidator::validate(INode *node)
{
- if(node == nullptr)
+ if (node == nullptr)
{
return Status{};
}
NodeType type = node->type();
- switch(type)
+ switch (type)
{
case NodeType::ArgMinMaxLayer:
- return detail::validate_arg_min_max_layer<CLArgMinMaxLayer>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+ return detail::validate_arg_min_max_layer<CLArgMinMaxLayer>(
+ *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
case NodeType::BoundingBoxTransformLayer:
- return detail::validate_bounding_box_transform_layer<CLBoundingBoxTransform>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
+ return detail::validate_bounding_box_transform_layer<CLBoundingBoxTransform>(
+ *polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
case NodeType::ChannelShuffleLayer:
- return detail::validate_channel_shuffle_layer<CLChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+ return detail::validate_channel_shuffle_layer<CLChannelShuffleLayer>(
+ *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
- return detail::validate_convolution_layer<CLConvolutionLayer,
- CLDirectConvolutionLayer,
- CLGEMMConvolutionLayer,
- CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+ return detail::validate_convolution_layer<CLConvolutionLayer, CLDirectConvolutionLayer,
+ CLGEMMConvolutionLayer, CLWinogradConvolutionLayer>(
+ *polymorphic_downcast<ConvolutionLayerNode *>(node));
case NodeType::DepthToSpaceLayer:
- return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+ return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(
+ *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
- return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer>(
+ *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
case NodeType::DequantizationLayer:
- return detail::validate_dequantization_layer<CLDequantizationLayer>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+ return detail::validate_dequantization_layer<CLDequantizationLayer>(
+ *polymorphic_downcast<DequantizationLayerNode *>(node));
case NodeType::DetectionOutputLayer:
- return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+ return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(
+ *polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::DetectionPostProcessLayer:
- return detail::validate_detection_post_process_layer<CPPDetectionPostProcessLayer>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+ return detail::validate_detection_post_process_layer<CPPDetectionPostProcessLayer>(
+ *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
case NodeType::GenerateProposalsLayer:
- return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
+ return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(
+ *polymorphic_downcast<GenerateProposalsLayerNode *>(node));
case NodeType::L2NormalizeLayer:
- return detail::validate_l2_normalize_layer<CLL2NormalizeLayer>(*polymorphic_downcast<L2NormalizeLayerNode *>(node));
+ return detail::validate_l2_normalize_layer<CLL2NormalizeLayer>(
+ *polymorphic_downcast<L2NormalizeLayerNode *>(node));
case NodeType::NormalizePlanarYUVLayer:
- return detail::validate_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+ return detail::validate_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer>(
+ *polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
case NodeType::PadLayer:
return detail::validate_pad_layer<CLPadLayer>(*polymorphic_downcast<PadLayerNode *>(node));
case NodeType::PermuteLayer:
@@ -101,9 +110,11 @@ Status CLNodeValidator::validate(INode *node)
case NodeType::PriorBoxLayer:
return detail::validate_priorbox_layer<CLPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
case NodeType::QuantizationLayer:
- return detail::validate_quantization_layer<CLQuantizationLayer>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+ return detail::validate_quantization_layer<CLQuantizationLayer>(
+ *polymorphic_downcast<QuantizationLayerNode *>(node));
case NodeType::ReductionOperationLayer:
- return detail::validate_reduction_operation_layer<CLReductionOperation>(*polymorphic_downcast<ReductionLayerNode *>(node));
+ return detail::validate_reduction_operation_layer<CLReductionOperation>(
+ *polymorphic_downcast<ReductionLayerNode *>(node));
case NodeType::ReorgLayer:
return detail::validate_reorg_layer<CLReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
case NodeType::ReshapeLayer:
@@ -113,11 +124,14 @@ Status CLNodeValidator::validate(INode *node)
case NodeType::SliceLayer:
return detail::validate_slice_layer<CLSlice>(*polymorphic_downcast<SliceLayerNode *>(node));
case NodeType::StridedSliceLayer:
- return detail::validate_strided_slice_layer<CLStridedSlice>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+ return detail::validate_strided_slice_layer<CLStridedSlice>(
+ *polymorphic_downcast<StridedSliceLayerNode *>(node));
case NodeType::EltwiseLayer:
- return detail::validate_eltwise_Layer<CLEltwiseLayerFunctions>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ return detail::validate_eltwise_Layer<CLEltwiseLayerFunctions>(
+ *polymorphic_downcast<EltwiseLayerNode *>(node));
case NodeType::UnaryEltwiseLayer:
- return detail::validate_unary_eltwise_layer<CLUnaryEltwiseLayerFunctions>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+ return detail::validate_unary_eltwise_layer<CLUnaryEltwiseLayerFunctions>(
+ *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
default:
return Status{};
}
diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp
index b97d25890a..ccdc877a18 100644
--- a/src/graph/backends/CL/CLSubTensorHandle.cpp
+++ b/src/graph/backends/CL/CLSubTensorHandle.cpp
@@ -31,7 +31,10 @@ namespace graph
{
namespace backends
{
-CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle,
+ const TensorShape &shape,
+ const Coordinates &coords,
+ bool extend_parent)
: _sub_tensor(), _parent_handle(nullptr)
{
ARM_COMPUTE_ERROR_ON(!parent_handle);
@@ -98,4 +101,4 @@ Target CLSubTensorHandle::target() const
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
index a496c2ce47..1b69f9dede 100644
--- a/src/graph/backends/CL/CLTensorHandle.cpp
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -31,8 +31,7 @@ namespace graph
{
namespace backends
{
-CLTensorHandle::CLTensorHandle(const ITensorInfo &info)
- : _tensor()
+CLTensorHandle::CLTensorHandle(const ITensorInfo &info) : _tensor()
{
_tensor.allocator()->init(info);
}
@@ -49,7 +48,7 @@ void CLTensorHandle::free()
void CLTensorHandle::manage(IMemoryGroup *mg)
{
- if(mg != nullptr)
+ if (mg != nullptr)
{
mg->manage(&_tensor);
}
@@ -68,7 +67,7 @@ void CLTensorHandle::unmap()
void CLTensorHandle::release_if_unused()
{
// TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
- if(!_tensor.is_used())
+ if (!_tensor.is_used())
{
_tensor.allocator()->free();
}
@@ -100,4 +99,4 @@ Target CLTensorHandle::target() const
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index 18456538da..fc7b309803 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -23,18 +23,17 @@
*/
#include "arm_compute/graph/backends/NEON/NEDeviceBackend.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/graph/backends/BackendRegistrar.h"
#include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
#include "arm_compute/graph/backends/NEON/NENodeValidator.h"
#include "arm_compute/graph/backends/NEON/NESubTensorHandle.h"
#include "arm_compute/graph/backends/NEON/NETensorHandle.h"
-
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
@@ -53,8 +52,7 @@ namespace backends
/** Register CPU backend */
static detail::BackendRegistrar<NEDeviceBackend> NEDeviceBackend_registrar(Target::NEON);
-NEDeviceBackend::NEDeviceBackend()
- : _allocator()
+NEDeviceBackend::NEDeviceBackend() : _allocator()
{
}
@@ -72,13 +70,13 @@ void NEDeviceBackend::release_backend_context(GraphContext &ctx)
void NEDeviceBackend::setup_backend_context(GraphContext &ctx)
{
// Set number of threads
- if(ctx.config().num_threads >= 0)
+ if (ctx.config().num_threads >= 0)
{
Scheduler::get().set_num_threads(ctx.config().num_threads);
}
// Create function level memory manager
- if(ctx.memory_management_ctx(Target::NEON) == nullptr)
+ if (ctx.memory_management_ctx(Target::NEON) == nullptr)
{
MemoryManagerContext mm_ctx;
mm_ctx.target = Target::NEON;
@@ -91,7 +89,7 @@ void NEDeviceBackend::setup_backend_context(GraphContext &ctx)
}
// Create function level weights manager
- if(ctx.weights_management_ctx(Target::NEON) == nullptr)
+ if (ctx.weights_management_ctx(Target::NEON) == nullptr)
{
WeightsManagerContext wm_ctx;
wm_ctx.target = Target::NEON;
@@ -124,9 +122,10 @@ std::unique_ptr<ITensorHandle> NEDeviceBackend::create_tensor(const Tensor &tens
return std::make_unique<NETensorHandle>(info);
}
-std::unique_ptr<ITensorHandle> NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+std::unique_ptr<ITensorHandle>
+NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
{
- if(parent == nullptr)
+ if (parent == nullptr)
{
return nullptr;
}
@@ -154,7 +153,7 @@ arm_compute::Status NEDeviceBackend::validate_node(INode &node)
std::shared_ptr<arm_compute::IMemoryManager> NEDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
{
std::shared_ptr<ILifetimeManager> lifetime_mgr = nullptr;
- if(affinity == MemoryManagerAffinity::Buffer)
+ if (affinity == MemoryManagerAffinity::Buffer)
{
lifetime_mgr = std::make_shared<BlobLifetimeManager>();
}
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index d7ed5f9ecb..fe15d4cec1 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -23,13 +23,13 @@
*/
#include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
+#include "arm_compute/graph/backends/FunctionHelpers.h"
+#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/TypePrinter.h"
-#include "arm_compute/graph/backends/FunctionHelpers.h"
-#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/runtime/CPP/CPPFunctions.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
@@ -88,7 +88,8 @@ struct NEFusedLayerTypes
namespace detail
{
template <>
-std::unique_ptr<IFunction> create_normalization_layer<NENormalizationLayer, NETargetInfo>(NormalizationLayerNode &node, GraphContext &ctx)
+std::unique_ptr<IFunction> create_normalization_layer<NENormalizationLayer, NETargetInfo>(NormalizationLayerNode &node,
+ GraphContext &ctx)
{
validate_node<NETargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
@@ -105,14 +106,10 @@ std::unique_ptr<IFunction> create_normalization_layer<NENormalizationLayer, NETa
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << NETargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Normalization info: " << norm_info.type()
- << std::endl);
+ << node.name() << " Type: " << node.type() << " Target: " << NETargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << norm_info.type() << std::endl);
return func;
}
@@ -120,84 +117,116 @@ std::unique_ptr<IFunction> create_normalization_layer<NENormalizationLayer, NETa
std::unique_ptr<IFunction> NEFunctionFactory::create(INode *node, GraphContext &ctx)
{
- if(node == nullptr)
+ if (node == nullptr)
{
return nullptr;
}
NodeType type = node->type();
- switch(type)
+ switch (type)
{
case NodeType::ActivationLayer:
- return detail::create_activation_layer<NEActivationLayer, NETargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
+ return detail::create_activation_layer<NEActivationLayer, NETargetInfo>(
+ *polymorphic_downcast<ActivationLayerNode *>(node));
case NodeType::ArgMinMaxLayer:
- return detail::create_arg_min_max_layer<NEArgMinMaxLayer, NETargetInfo>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+ return detail::create_arg_min_max_layer<NEArgMinMaxLayer, NETargetInfo>(
+ *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
case NodeType::BatchNormalizationLayer:
- return detail::create_batch_normalization_layer<NEBatchNormalizationLayer, NETargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+ return detail::create_batch_normalization_layer<NEBatchNormalizationLayer, NETargetInfo>(
+ *polymorphic_downcast<BatchNormalizationLayerNode *>(node));
case NodeType::ChannelShuffleLayer:
- return detail::create_channel_shuffle_layer<NEChannelShuffleLayer, NETargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+ return detail::create_channel_shuffle_layer<NEChannelShuffleLayer, NETargetInfo>(
+ *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
- return detail::create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+ return detail::create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(
+ *polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
case NodeType::DepthToSpaceLayer:
- return detail::create_depth_to_space_layer<NEDepthToSpaceLayer, NETargetInfo>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+ return detail::create_depth_to_space_layer<NEDepthToSpaceLayer, NETargetInfo>(
+ *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
case NodeType::DeconvolutionLayer:
- return detail::create_deconvolution_layer<NEDeconvolutionLayer, NETargetInfo>(*polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
+ return detail::create_deconvolution_layer<NEDeconvolutionLayer, NETargetInfo>(
+ *polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
case NodeType::ConcatenateLayer:
- return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
+ return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(
+ *polymorphic_downcast<ConcatenateLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
- return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayer, NETargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayer, NETargetInfo>(
+ *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
case NodeType::DequantizationLayer:
- return detail::create_dequantization_layer<NEDequantizationLayer, NETargetInfo>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+ return detail::create_dequantization_layer<NEDequantizationLayer, NETargetInfo>(
+ *polymorphic_downcast<DequantizationLayerNode *>(node));
case NodeType::DetectionOutputLayer:
- return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+ return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(
+ *polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::DetectionPostProcessLayer:
- return detail::create_detection_post_process_layer<NEDetectionPostProcessLayer, NETargetInfo>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+ return detail::create_detection_post_process_layer<NEDetectionPostProcessLayer, NETargetInfo>(
+ *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
case NodeType::EltwiseLayer:
- return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(
+ *polymorphic_downcast<EltwiseLayerNode *>(node));
case NodeType::UnaryEltwiseLayer:
- return detail::create_unary_eltwise_layer<NEUnaryEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+ return detail::create_unary_eltwise_layer<NEUnaryEltwiseFunctions, NETargetInfo>(
+ *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
case NodeType::FlattenLayer:
- return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
+ return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(
+ *polymorphic_downcast<FlattenLayerNode *>(node));
case NodeType::FullyConnectedLayer:
- return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+ return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(
+ *polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
case NodeType::FusedConvolutionBatchNormalizationLayer:
- return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
+ return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(
+ *polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
- return detail::create_fused_depthwise_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
+ return detail::create_fused_depthwise_convolution_batch_normalization_layer<NEFusedLayerTypes,
+ NETargetInfo>(
+ *polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
case NodeType::L2NormalizeLayer:
- return detail::create_l2_normalize_layer<NEL2NormalizeLayer, NETargetInfo>(*polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
+ return detail::create_l2_normalize_layer<NEL2NormalizeLayer, NETargetInfo>(
+ *polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
case NodeType::NormalizationLayer:
- return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+ return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(
+ *polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
case NodeType::PadLayer:
return detail::create_pad_layer<NEPadLayer, NETargetInfo>(*polymorphic_downcast<PadLayerNode *>(node));
case NodeType::PermuteLayer:
- return detail::create_permute_layer<NEPermute, NETargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
+ return detail::create_permute_layer<NEPermute, NETargetInfo>(
+ *polymorphic_downcast<PermuteLayerNode *>(node));
case NodeType::PoolingLayer:
- return detail::create_pooling_layer<NEPoolingLayer, NETargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+ return detail::create_pooling_layer<NEPoolingLayer, NETargetInfo>(
+ *polymorphic_downcast<PoolingLayerNode *>(node));
case NodeType::PReluLayer:
- return detail::create_prelu_layer<NEPReluLayer, NETargetInfo>(*polymorphic_downcast<PReluLayerNode *>(node));
+ return detail::create_prelu_layer<NEPReluLayer, NETargetInfo>(
+ *polymorphic_downcast<PReluLayerNode *>(node));
case NodeType::PrintLayer:
return detail::create_print_layer<NETargetInfo>(*polymorphic_downcast<PrintLayerNode *>(node));
case NodeType::PriorBoxLayer:
- return detail::create_priorbox_layer<NEPriorBoxLayer, NETargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+ return detail::create_priorbox_layer<NEPriorBoxLayer, NETargetInfo>(
+ *polymorphic_downcast<PriorBoxLayerNode *>(node));
case NodeType::QuantizationLayer:
- return detail::create_quantization_layer<NEQuantizationLayer, NETargetInfo>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+ return detail::create_quantization_layer<NEQuantizationLayer, NETargetInfo>(
+ *polymorphic_downcast<QuantizationLayerNode *>(node));
case NodeType::ReductionOperationLayer:
- return detail::create_reduction_operation_layer<NEReductionOperation, NETargetInfo>(*polymorphic_downcast<ReductionLayerNode *>(node), ctx);
+ return detail::create_reduction_operation_layer<NEReductionOperation, NETargetInfo>(
+ *polymorphic_downcast<ReductionLayerNode *>(node), ctx);
case NodeType::ReorgLayer:
- return detail::create_reorg_layer<NEReorgLayer, NETargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
+ return detail::create_reorg_layer<NEReorgLayer, NETargetInfo>(
+ *polymorphic_downcast<ReorgLayerNode *>(node));
case NodeType::ReshapeLayer:
- return detail::create_reshape_layer<NEReshapeLayer, NETargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
+ return detail::create_reshape_layer<NEReshapeLayer, NETargetInfo>(
+ *polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ResizeLayer:
return detail::create_resize_layer<NEScale, NETargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
case NodeType::SliceLayer:
return detail::create_slice_layer<NESlice, NETargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
case NodeType::SoftmaxLayer:
- return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+ return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(
+ *polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
case NodeType::StackLayer:
- return detail::create_stack_layer<NEStackLayer, NETargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
+ return detail::create_stack_layer<NEStackLayer, NETargetInfo>(
+ *polymorphic_downcast<StackLayerNode *>(node));
case NodeType::StridedSliceLayer:
- return detail::create_strided_slice_layer<NEStridedSlice, NETargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+ return detail::create_strided_slice_layer<NEStridedSlice, NETargetInfo>(
+ *polymorphic_downcast<StridedSliceLayerNode *>(node));
default:
return nullptr;
}
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index a485e5d235..a97806f92c 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/graph/backends/ValidateHelpers.h"
#include "arm_compute/graph/nodes/Nodes.h"
-
#include "arm_compute/runtime/CPP/CPPFunctions.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
+
#include "support/Cast.h"
using namespace arm_compute::utils::cast;
@@ -56,41 +56,51 @@ struct NEUnaryEltwiseLayerFunctions
Status NENodeValidator::validate(INode *node)
{
- if(node == nullptr)
+ if (node == nullptr)
{
return Status{};
}
NodeType type = node->type();
- switch(type)
+ switch (type)
{
case NodeType::ArgMinMaxLayer:
- return detail::validate_arg_min_max_layer<NEArgMinMaxLayer>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+ return detail::validate_arg_min_max_layer<NEArgMinMaxLayer>(
+ *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
case NodeType::BoundingBoxTransformLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+ "Unsupported operation : BoundingBoxTransformLayer");
case NodeType::ChannelShuffleLayer:
- return detail::validate_channel_shuffle_layer<NEChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+ return detail::validate_channel_shuffle_layer<NEChannelShuffleLayer>(
+ *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
case NodeType::ConvolutionLayer:
- return detail::validate_convolution_layer<NEConvolutionLayer,
- NEDirectConvolutionLayer,
- NEGEMMConvolutionLayer,
- NEWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+ return detail::validate_convolution_layer<NEConvolutionLayer, NEDirectConvolutionLayer,
+ NEGEMMConvolutionLayer, NEWinogradConvolutionLayer>(
+ *polymorphic_downcast<ConvolutionLayerNode *>(node));
case NodeType::DepthToSpaceLayer:
- return detail::validate_depth_to_space_layer<NEDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+ return detail::validate_depth_to_space_layer<NEDepthToSpaceLayer>(
+ *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
case NodeType::DepthwiseConvolutionLayer:
- return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+ return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer>(
+ *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
case NodeType::DequantizationLayer:
- return detail::validate_dequantization_layer<NEDequantizationLayer>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+ return detail::validate_dequantization_layer<NEDequantizationLayer>(
+ *polymorphic_downcast<DequantizationLayerNode *>(node));
case NodeType::DetectionOutputLayer:
- return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+ return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(
+ *polymorphic_downcast<DetectionOutputLayerNode *>(node));
case NodeType::DetectionPostProcessLayer:
- return detail::validate_detection_post_process_layer<NEDetectionPostProcessLayer>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+ return detail::validate_detection_post_process_layer<NEDetectionPostProcessLayer>(
+ *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
case NodeType::GenerateProposalsLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+ "Unsupported operation : GenerateProposalsLayer");
case NodeType::L2NormalizeLayer:
- return detail::validate_l2_normalize_layer<NEL2NormalizeLayer>(*polymorphic_downcast<L2NormalizeLayerNode *>(node));
+ return detail::validate_l2_normalize_layer<NEL2NormalizeLayer>(
+ *polymorphic_downcast<L2NormalizeLayerNode *>(node));
case NodeType::NormalizePlanarYUVLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : NormalizePlanarYUVLayer");
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+ "Unsupported operation : NormalizePlanarYUVLayer");
case NodeType::PadLayer:
return detail::validate_pad_layer<NEPadLayer>(*polymorphic_downcast<PadLayerNode *>(node));
case NodeType::PermuteLayer:
@@ -100,23 +110,29 @@ Status NENodeValidator::validate(INode *node)
case NodeType::PriorBoxLayer:
return detail::validate_priorbox_layer<NEPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
case NodeType::QuantizationLayer:
- return detail::validate_quantization_layer<NEQuantizationLayer>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+ return detail::validate_quantization_layer<NEQuantizationLayer>(
+ *polymorphic_downcast<QuantizationLayerNode *>(node));
case NodeType::ReductionOperationLayer:
- return detail::validate_reduction_operation_layer<NEReductionOperation>(*polymorphic_downcast<ReductionLayerNode *>(node));
+ return detail::validate_reduction_operation_layer<NEReductionOperation>(
+ *polymorphic_downcast<ReductionLayerNode *>(node));
case NodeType::ReorgLayer:
return detail::validate_reorg_layer<NEReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
case NodeType::ReshapeLayer:
return detail::validate_reshape_layer<NEReshapeLayer>(*polymorphic_downcast<ReshapeLayerNode *>(node));
case NodeType::ROIAlignLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
+ return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+ "Unsupported operation : ROIAlignLayer");
case NodeType::SliceLayer:
return detail::validate_slice_layer<NESlice>(*polymorphic_downcast<SliceLayerNode *>(node));
case NodeType::StridedSliceLayer:
- return detail::validate_strided_slice_layer<NEStridedSlice>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+ return detail::validate_strided_slice_layer<NEStridedSlice>(
+ *polymorphic_downcast<StridedSliceLayerNode *>(node));
case NodeType::EltwiseLayer:
- return detail::validate_eltwise_Layer<NEEltwiseLayerFunctions>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+ return detail::validate_eltwise_Layer<NEEltwiseLayerFunctions>(
+ *polymorphic_downcast<EltwiseLayerNode *>(node));
case NodeType::UnaryEltwiseLayer:
- return detail::validate_unary_eltwise_layer<NEUnaryEltwiseLayerFunctions>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+ return detail::validate_unary_eltwise_layer<NEUnaryEltwiseLayerFunctions>(
+ *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
default:
return Status{};
}
diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp
index 36f29d0d10..8964a00c5e 100644
--- a/src/graph/backends/NEON/NESubTensorHandle.cpp
+++ b/src/graph/backends/NEON/NESubTensorHandle.cpp
@@ -29,7 +29,10 @@ namespace graph
{
namespace backends
{
-NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle,
+ const TensorShape &shape,
+ const Coordinates &coords,
+ bool extend_parent)
: _sub_tensor(), _parent_handle(nullptr)
{
ARM_COMPUTE_ERROR_ON(!parent_handle);
@@ -95,4 +98,4 @@ Target NESubTensorHandle::target() const
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
index 4393156e8a..dabf67060d 100644
--- a/src/graph/backends/NEON/NETensorHandle.cpp
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/graph/backends/NEON/NETensorHandle.h"
#include "arm_compute/runtime/MemoryGroup.h"
+
#include "support/Cast.h"
namespace arm_compute
@@ -32,8 +33,7 @@ namespace graph
{
namespace backends
{
-NETensorHandle::NETensorHandle(const ITensorInfo &info)
- : _tensor()
+NETensorHandle::NETensorHandle(const ITensorInfo &info) : _tensor()
{
_tensor.allocator()->init(info);
}
@@ -50,7 +50,7 @@ void NETensorHandle::free()
void NETensorHandle::manage(IMemoryGroup *mg)
{
- if(mg != nullptr)
+ if (mg != nullptr)
{
mg->manage(&_tensor);
}
@@ -68,7 +68,7 @@ void NETensorHandle::unmap()
void NETensorHandle::release_if_unused()
{
// TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
- if(!_tensor.is_used())
+ if (!_tensor.is_used())
{
_tensor.allocator()->free();
}
@@ -100,4 +100,4 @@ Target NETensorHandle::target() const
}
} // namespace backends
} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index b45f453f23..1e813dc678 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -23,6 +23,8 @@
*/
#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/GraphManager.h"
@@ -30,9 +32,7 @@
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
-#include "arm_compute/core/ITensor.h"
#include "support/Cast.h"
#include <algorithm>
@@ -78,28 +78,28 @@ IMemoryGroup *get_memory_group_from_handle(GraphContext &ctx, ITensorHandle *han
*/
std::set<ITensorHandle *> get_const_handles(const Graph &g)
{
- std::set<NodeType> const_node_types = { NodeType::Input, NodeType::Output, NodeType::Const };
+ std::set<NodeType> const_node_types = {NodeType::Input, NodeType::Output, NodeType::Const};
std::set<ITensorHandle *> const_tensors;
auto &nodes = g.nodes();
- for(auto &node : nodes)
+ for (auto &node : nodes)
{
// If its a const node:
- if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
+ if (node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
{
// TODO (geopin01) : Create IO iterator wrappers
// Add all its inputs / outputs to the list of constant handles
- for(unsigned int i = 0; i < node->num_inputs(); ++i)
+ for (unsigned int i = 0; i < node->num_inputs(); ++i)
{
- if(node->input(i) != nullptr)
+ if (node->input(i) != nullptr)
{
const_tensors.insert(node->input(i)->handle()->parent_handle());
}
}
- for(unsigned int i = 0; i < node->num_outputs(); ++i)
+ for (unsigned int i = 0; i < node->num_outputs(); ++i)
{
- if(node->output(i) != nullptr)
+ if (node->output(i) != nullptr)
{
const_tensors.insert(node->output(i)->handle()->parent_handle());
}
@@ -118,9 +118,8 @@ std::set<ITensorHandle *> get_const_handles(const Graph &g)
*
* @return List of transition handles
*/
-TaskHandles get_transition_handles(GraphContext &ctx,
- ExecutionTask &task,
- const std::set<ITensorHandle *> &const_tensors)
+TaskHandles
+get_transition_handles(GraphContext &ctx, ExecutionTask &task, const std::set<ITensorHandle *> &const_tensors)
{
ARM_COMPUTE_ERROR_ON(task.node == nullptr || (task.task == nullptr && !is_utility_node(task.node)));
INode &node = *task.node;
@@ -128,28 +127,30 @@ TaskHandles get_transition_handles(GraphContext &ctx,
TaskHandles transition_handles;
// Add input handles
- for(unsigned int i = 0; i < node.input_edges().size(); ++i)
+ for (unsigned int i = 0; i < node.input_edges().size(); ++i)
{
Edge *input_edge = node.input_edge(i);
// If this input is the output of another node
- if(input_edge != nullptr && input_edge->tensor() != nullptr && const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors))
+ if (input_edge != nullptr && input_edge->tensor() != nullptr &&
+ const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors))
{
// Then add it to the list of transition buffers
ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle();
- IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
+ IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
transition_handles.input_handles.emplace_back(std::make_pair(tensor_handle, mm_group));
}
}
// Add output handles
- for(unsigned int i = 0; i < node.num_outputs(); ++i)
+ for (unsigned int i = 0; i < node.num_outputs(); ++i)
{
Tensor *output_tensor = node.output(i);
// If this output is used as an input for another node
- if(output_tensor != nullptr && const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors))
+ if (output_tensor != nullptr &&
+ const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors))
{
ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle();
- IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
+ IMemoryGroup *mm_group = get_memory_group_from_handle(ctx, tensor_handle);
transition_handles.output_handles.emplace_back(std::make_pair(tensor_handle, mm_group));
}
}
@@ -164,11 +165,11 @@ TaskHandles get_transition_handles(GraphContext &ctx,
*/
void count_input_handles_per_target(const TaskHandles &task_handles, TargetHandleCounter &handle_counter)
{
- for(const auto &handle : task_handles.input_handles)
+ for (const auto &handle : task_handles.input_handles)
{
ITensorHandle *key = handle.first;
HandleCounter &target_counter = handle_counter[key->target()];
- if(target_counter.find(key) == std::end(target_counter))
+ if (target_counter.find(key) == std::end(target_counter))
{
target_counter.emplace(std::make_pair(key, 1));
}
@@ -192,12 +193,12 @@ void configure_handle_lifetime(std::vector<TaskHandles> &tasks_handles, const Ha
// Acquires the given handles and sets them as in flight if they aren't already
auto acquire = [&](std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> &handles)
{
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ITensorHandle *parent_handle = handle.first;
ARM_COMPUTE_ERROR_ON(parent_handle == nullptr);
// If the tensor is not already in flight:
- if(tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight))
+ if (tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight))
{
ARM_COMPUTE_ERROR_ON(hc.find(parent_handle) == std::end(hc));
// Then add it to the list of in flight tensors
@@ -208,20 +209,20 @@ void configure_handle_lifetime(std::vector<TaskHandles> &tasks_handles, const Ha
}
};
- for(auto &task_handle : tasks_handles)
+ for (auto &task_handle : tasks_handles)
{
// Marking all the input and output tensors of the task as in flight
acquire(task_handle.input_handles);
acquire(task_handle.output_handles);
// Releasing the input tensors
- for(auto &input_handle : task_handle.input_handles)
+ for (auto &input_handle : task_handle.input_handles)
{
ITensorHandle *ihandle = input_handle.first;
ARM_COMPUTE_ERROR_ON(ihandle == nullptr);
ARM_COMPUTE_ERROR_ON(tensors_in_flight.find(ihandle) == std::end(tensors_in_flight));
--tensors_in_flight[ihandle];
- if(tensors_in_flight[ihandle] <= 0)
+ if (tensors_in_flight[ihandle] <= 0)
{
// Remove tensor for tensors in flight
tensors_in_flight.erase(ihandle);
@@ -242,7 +243,7 @@ void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload
TargetHandleCounter target_handle_count;
// Count handles
- for(auto &task : workload.tasks)
+ for (auto &task : workload.tasks)
{
// Populates IO handles
tasks_handles.push_back(get_transition_handles(ctx, task, const_tensors));
@@ -252,12 +253,12 @@ void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload
}
// Setup memory managers
- for(auto &hc : target_handle_count)
+ for (auto &hc : target_handle_count)
{
MemoryManagerContext *mm_ctx = ctx.memory_management_ctx(hc.first);
- if(mm_ctx != nullptr)
+ if (mm_ctx != nullptr)
{
- if(mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr)
+ if (mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr)
{
// Manage and allocate tensors
configure_handle_lifetime(tasks_handles, hc.second);
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index ac800df76c..870d24a6c7 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -23,12 +23,12 @@
*/
#include "arm_compute/graph/detail/ExecutionHelpers.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/GraphManager.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
namespace arm_compute
{
@@ -41,9 +41,9 @@ void validate_all_nodes(Graph &g)
auto &nodes = g.nodes();
// Create tasks
- for(auto &node : nodes)
+ for (auto &node : nodes)
{
- if(node != nullptr)
+ if (node != nullptr)
{
Target assigned_target = node->assigned_target();
backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target);
@@ -57,9 +57,9 @@ void configure_all_tensors(Graph &g)
{
auto &tensors = g.tensors();
- for(auto &tensor : tensors)
+ for (auto &tensor : tensors)
{
- if(tensor && tensor->handle() == nullptr)
+ if (tensor && tensor->handle() == nullptr)
{
Target target = tensor->desc().target;
backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(target);
@@ -72,10 +72,10 @@ void configure_all_tensors(Graph &g)
void allocate_all_input_tensors(INode &node)
{
- for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ for (unsigned int i = 0; i < node.num_inputs(); ++i)
{
Tensor *tensor = node.input(i);
- if(tensor != nullptr && !tensor->bound_edges().empty())
+ if (tensor != nullptr && !tensor->bound_edges().empty())
{
ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
tensor->handle()->allocate();
@@ -85,10 +85,10 @@ void allocate_all_input_tensors(INode &node)
void allocate_all_output_tensors(INode &node)
{
- for(unsigned int i = 0; i < node.num_outputs(); ++i)
+ for (unsigned int i = 0; i < node.num_outputs(); ++i)
{
Tensor *tensor = node.output(i);
- if(tensor != nullptr && !tensor->bound_edges().empty())
+ if (tensor != nullptr && !tensor->bound_edges().empty())
{
ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
tensor->handle()->allocate();
@@ -98,11 +98,11 @@ void allocate_all_output_tensors(INode &node)
void allocate_const_tensors(Graph &g)
{
- for(auto &node : g.nodes())
+ for (auto &node : g.nodes())
{
- if(node != nullptr)
+ if (node != nullptr)
{
- switch(node->type())
+ switch (node->type())
{
case NodeType::Const:
case NodeType::Input:
@@ -121,9 +121,10 @@ void allocate_all_tensors(Graph &g)
{
auto &tensors = g.tensors();
- for(auto &tensor : tensors)
+ for (auto &tensor : tensors)
{
- if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
+ if (tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr &&
+ tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
{
tensor->handle()->allocate();
}
@@ -140,15 +141,15 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve
workload.tasks.reserve(node_order.size());
// Create tasks
- for(auto &node_id : node_order)
+ for (auto &node_id : node_order)
{
auto node = g.node(node_id);
- if(node != nullptr)
+ if (node != nullptr)
{
Target assigned_target = node->assigned_target();
- backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target);
+ backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(assigned_target);
std::unique_ptr<IFunction> func = backend.configure_node(*node, ctx);
- if(func != nullptr || is_utility_node(node))
+ if (func != nullptr || is_utility_node(node))
{
workload.tasks.emplace_back(ExecutionTask(std::move(func), node));
}
@@ -156,14 +157,14 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve
}
// Add inputs and outputs
- for(auto &node : g.nodes())
+ for (auto &node : g.nodes())
{
- if(node != nullptr && node->type() == NodeType::Input)
+ if (node != nullptr && node->type() == NodeType::Input)
{
workload.inputs.push_back(node->output(0));
}
- if(node != nullptr && node->type() == NodeType::Output)
+ if (node != nullptr && node->type() == NodeType::Output)
{
workload.outputs.push_back(node->input(0));
continue;
@@ -175,9 +176,9 @@ ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx, const std::ve
void release_unused_tensors(Graph &g)
{
- for(auto &tensor : g.tensors())
+ for (auto &tensor : g.tensors())
{
- if(tensor != nullptr && tensor->handle() != nullptr)
+ if (tensor != nullptr && tensor->handle() != nullptr)
{
tensor->handle()->release_if_unused();
}
@@ -194,11 +195,11 @@ void call_all_const_node_accessors(Graph &g)
{
auto &nodes = g.nodes();
- for(auto &node : nodes)
+ for (auto &node : nodes)
{
- if(node != nullptr && node->type() == NodeType::Const && node->num_outputs())
+ if (node != nullptr && node->type() == NodeType::Const && node->num_outputs())
{
- if(!node->output(0)->bound_edges().empty())
+ if (!node->output(0)->bound_edges().empty())
{
call_tensor_accessor(node->output(0));
}
@@ -209,18 +210,19 @@ void call_all_const_node_accessors(Graph &g)
bool call_all_input_node_accessors(ExecutionWorkload &workload)
{
bool is_valid = true;
- std::for_each(std::begin(workload.inputs), std::end(workload.inputs), [&](Tensor * input_tensor)
- {
- bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor();
- is_valid = is_valid && valid_input;
- });
+ std::for_each(std::begin(workload.inputs), std::end(workload.inputs),
+ [&](Tensor *input_tensor)
+ {
+ bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor();
+ is_valid = is_valid && valid_input;
+ });
return is_valid;
}
void prepare_all_tasks(ExecutionWorkload &workload)
{
ARM_COMPUTE_ERROR_ON(workload.graph == nullptr);
- for(auto &task : workload.tasks)
+ for (auto &task : workload.tasks)
{
task.prepare();
release_unused_tensors(*workload.graph);
@@ -232,24 +234,24 @@ void call_all_tasks(ExecutionWorkload &workload)
ARM_COMPUTE_ERROR_ON(workload.ctx == nullptr);
// Acquire memory for the transition buffers
- for(auto &mm_ctx : workload.ctx->memory_managers())
+ for (auto &mm_ctx : workload.ctx->memory_managers())
{
- if(mm_ctx.second.cross_group != nullptr)
+ if (mm_ctx.second.cross_group != nullptr)
{
mm_ctx.second.cross_group->acquire();
}
}
// Execute tasks
- for(auto &task : workload.tasks)
+ for (auto &task : workload.tasks)
{
task();
}
// Release memory for the transition buffers
- for(auto &mm_ctx : workload.ctx->memory_managers())
+ for (auto &mm_ctx : workload.ctx->memory_managers())
{
- if(mm_ctx.second.cross_group != nullptr)
+ if (mm_ctx.second.cross_group != nullptr)
{
mm_ctx.second.cross_group->release();
}
@@ -259,11 +261,12 @@ void call_all_tasks(ExecutionWorkload &workload)
bool call_all_output_node_accessors(ExecutionWorkload &workload)
{
bool is_valid = true;
- std::for_each(std::begin(workload.outputs), std::end(workload.outputs), [&](Tensor * output_tensor)
- {
- bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor();
- is_valid = is_valid && valid_output;
- });
+ std::for_each(std::begin(workload.outputs), std::end(workload.outputs),
+ [&](Tensor *output_tensor)
+ {
+ bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor();
+ is_valid = is_valid && valid_output;
+ });
sync_backends();
diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp
index 44c8400874..383a6dc67f 100644
--- a/src/graph/frontend/Stream.cpp
+++ b/src/graph/frontend/Stream.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/graph/frontend/Stream.h"
-#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/frontend/ILayer.h"
+#include "arm_compute/graph/Utils.h"
namespace arm_compute
{
@@ -32,8 +32,7 @@ namespace graph
{
namespace frontend
{
-Stream::Stream(size_t id, std::string name)
- : _ctx(), _manager(), _g(id, std::move(name))
+Stream::Stream(size_t id, std::string name) : _ctx(), _manager(), _g(id, std::move(name))
{
}
diff --git a/src/graph/frontend/SubStream.cpp b/src/graph/frontend/SubStream.cpp
index 4b42207e80..8596aaa1a3 100644
--- a/src/graph/frontend/SubStream.cpp
+++ b/src/graph/frontend/SubStream.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/graph/frontend/SubStream.h"
-#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/frontend/ILayer.h"
+#include "arm_compute/graph/Graph.h"
namespace arm_compute
{
@@ -32,8 +32,7 @@ namespace graph
{
namespace frontend
{
-SubStream::SubStream(IStream &s)
- : _s(s)
+SubStream::SubStream(IStream &s) : _s(s)
{
_hints = s.hints();
_tail_node = s.tail_node();
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
index 963b948432..1b7ee3c4a4 100644
--- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -23,12 +23,12 @@
*/
#include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/algorithms/TopologicalSort.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/nodes/ConcatenateLayerNode.h"
+#include "arm_compute/graph/Utils.h"
#include "support/Cast.h"
#include "support/Iterable.h"
@@ -50,7 +50,7 @@ IGraphMutator::MutationType DepthConcatSubTensorMutator::type() const
void DepthConcatSubTensorMutator::mutate(Graph &g)
{
// Early exit if no Concatenation layers exist in graph
- if(g.nodes(NodeType::ConcatenateLayer).empty())
+ if (g.nodes(NodeType::ConcatenateLayer).empty())
{
return;
}
@@ -59,43 +59,48 @@ void DepthConcatSubTensorMutator::mutate(Graph &g)
std::vector<NodeID> topological_sorted_node_ids = dfs(g);
// Should be in reverse order of execution
- for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
+ for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
{
INode *node = g.node(node_id);
- if(node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr)
+ if (node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr)
{
// Get output tensor
auto output_tensor = node->output(0);
// Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2)
auto *concat_node = arm_compute::utils::cast::polymorphic_downcast<ConcatenateLayerNode *>(node);
- if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2)
+ if (output_tensor == nullptr ||
+ get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2)
{
continue;
}
// Check that all tensor have the same target, valid inputs and same quantization info
- bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
- [&](const EdgeID & eid)
- {
- return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target)
- && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info);
- });
+ bool is_valid =
+ std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
+ [&](const EdgeID &eid)
+ {
+ return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) &&
+ (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) &&
+ (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info);
+ });
// Create subtensors
- if(is_valid && is_target_supported(output_tensor->desc().target))
+ if (is_valid && is_target_supported(output_tensor->desc().target))
{
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
<< node->id() << " and name : " << node->name() << std::endl);
// Create sub-tensor handles
unsigned depth = 0;
- for(unsigned int i = 0; i < node->input_edges().size(); ++i)
+ for (unsigned int i = 0; i < node->input_edges().size(); ++i)
{
auto input_tensor = node->input(i);
const auto input_shape = input_tensor->desc().shape;
- backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(input_tensor->desc().target);
- std::unique_ptr<ITensorHandle> handle = backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false);
+ backends::IDeviceBackend &backend =
+ backends::BackendRegistry::get().get_backend(input_tensor->desc().target);
+ std::unique_ptr<ITensorHandle> handle =
+ backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false);
input_tensor->set_handle(std::move(handle));
depth += input_shape.z();
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index b7c551ce8b..31efba6bb1 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -23,15 +23,14 @@
*/
#include "arm_compute/graph/mutators/GroupedConvolutionMutator.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphBuilder.h"
#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
#include "support/Cast.h"
-
#include "support/StringSupport.h"
#include <set>
@@ -42,43 +41,51 @@ namespace graph
{
namespace
{
-NodeID create_grouped_convolution(Graph &g, const NodeParams &params, NodeIdxPair input, NodeID weights, NodeID bias,
- PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups)
+NodeID create_grouped_convolution(Graph &g,
+ const NodeParams &params,
+ NodeIdxPair input,
+ NodeID weights,
+ NodeID bias,
+ PadStrideInfo conv_info,
+ ConvolutionMethod method,
+ ActivationLayerInfo fused_act,
+ FastMathHint fast_math_hint,
+ unsigned int num_groups)
{
bool has_bias = (bias != EmptyNodeID);
// Split input
const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
- const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL);
- NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx);
+ const unsigned int input_idx = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL);
+ NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx);
// Split weights
const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]);
- const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES);
- NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx);
+ const unsigned int batch_idx = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES);
+ NodeID weights_split = GraphBuilder::add_split_node(g, params, {weights, 0}, num_groups, batch_idx);
// Split bias
NodeID bias_split = EmptyNodeID;
- if(has_bias)
+ if (has_bias)
{
// Split bias
- bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0);
+ bias_split = GraphBuilder::add_split_node(g, params, {bias, 0}, num_groups, 0);
}
std::vector<NodeIdxPair> convolution_outputs;
- for(unsigned int i = 0; i < num_groups; ++i)
+ for (unsigned int i = 0; i < num_groups; ++i)
{
NodeParams group_params = params;
NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, 1, method, fast_math_hint);
g.add_connection(input_split, i, conv_nid, 0);
g.add_connection(weights_split, i, conv_nid, 1);
- if(has_bias)
+ if (has_bias)
{
g.add_connection(bias_split, i, conv_nid, 2);
}
// Add group name
- if(!group_params.name.empty())
+ if (!group_params.name.empty())
{
group_params.name.append("_g" + arm_compute::support::cpp11::to_string(i));
}
@@ -92,7 +99,7 @@ NodeID create_grouped_convolution(Graph &g, const NodeParams &params, NodeIdxPai
auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
conv_node->set_fused_activation(fused_act);
- convolution_outputs.push_back({ conv_nid, 0 });
+ convolution_outputs.push_back({conv_nid, 0});
}
// Depth concatenate output
@@ -113,7 +120,7 @@ IGraphMutator::MutationType GroupedConvolutionMutator::type() const
void GroupedConvolutionMutator::mutate(Graph &g)
{
// Early exit if no Convolution layers exist in graph
- if(g.nodes(NodeType::ConvolutionLayer).empty())
+ if (g.nodes(NodeType::ConvolutionLayer).empty())
{
return;
}
@@ -122,17 +129,18 @@ void GroupedConvolutionMutator::mutate(Graph &g)
size_t total_nodes = g.nodes().size();
// Iterate over convolution nodes
- for(unsigned int i = 0; i < total_nodes; ++i)
+ for (unsigned int i = 0; i < total_nodes; ++i)
{
INode *node = g.node(i);
- if(node != nullptr && node->type() == NodeType::ConvolutionLayer && arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node)->num_groups() != 1)
+ if (node != nullptr && node->type() == NodeType::ConvolutionLayer &&
+ arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node)->num_groups() != 1)
{
// Validate node
backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target());
Status status = backend.validate_node(*node);
// If grouped convolution is not supported
- if(!bool(status))
+ if (!bool(status))
{
// Down-cast node
auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
@@ -151,7 +159,8 @@ void GroupedConvolutionMutator::mutate(Graph &g)
ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr);
const NodeID input_id = conv_node->input_edge(0)->producer()->id();
const NodeID weights_id = conv_node->input_edge(1)->producer()->id();
- const NodeID bias_id = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID;
+ const NodeID bias_id =
+ (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID;
// Get driving nodes
std::vector<NodeIdxPair> driving_nodes = get_driving_nodes(*node);
@@ -164,14 +173,15 @@ void GroupedConvolutionMutator::mutate(Graph &g)
NodeID latest_nid = g.nodes().size();
// Create grouped convolution node
- NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id,
- conv_info, conv_method, fused_act_info, fast_math_hint, num_groups);
+ NodeID grouped_conv_id =
+ create_grouped_convolution(g, params, {input_id, 0}, weights_id, bias_id, conv_info, conv_method,
+ fused_act_info, fast_math_hint, num_groups);
// Remove convolution node
g.remove_node(node->id());
// Update batch normalization node outputs
- for(auto &driving_node : driving_nodes)
+ for (auto &driving_node : driving_nodes)
{
g.add_connection(grouped_conv_id, 0, driving_node.node_id, driving_node.index);
}
@@ -180,17 +190,16 @@ void GroupedConvolutionMutator::mutate(Graph &g)
g.node(grouped_conv_id)->output(0)->set_accessor(std::move(node_accessor));
// Configure new tensors and nodes
- std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), [](std::unique_ptr<Tensor> &t)
- {
- configure_tensor(t.get());
- });
- std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), [&assigned_target](std::unique_ptr<INode> &n)
- {
- if(n != nullptr)
- {
- n->set_assigned_target(assigned_target);
- }
- });
+ std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(),
+ [](std::unique_ptr<Tensor> &t) { configure_tensor(t.get()); });
+ std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(),
+ [&assigned_target](std::unique_ptr<INode> &n)
+ {
+ if (n != nullptr)
+ {
+ n->set_assigned_target(assigned_target);
+ }
+ });
}
}
}
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index d3ea940895..a51dcc4f42 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
+
#include "support/Cast.h"
using namespace arm_compute::utils::cast;
@@ -48,7 +49,7 @@ bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge)
const auto input_tensor = input_edge->tensor();
const auto input_edge_id = input_edge->id();
- if(parent_node == nullptr)
+ if (parent_node == nullptr)
{
return false;
}
@@ -57,24 +58,23 @@ bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge)
// If the output is connected to only one edge, then computations can
// be done in-place.
- if(output_edges.size() == 1)
+ if (output_edges.size() == 1)
{
return true;
}
- return std::all_of(output_edges.begin(),
- output_edges.end(),
- [&](const EdgeID & edge_id)
- {
- // Skip check on current input edge
- if(edge_id == input_edge_id)
- {
- return true;
- }
-
- auto edge = g.edge(edge_id);
- return edge->tensor() != input_tensor;
- });
+ return std::all_of(output_edges.begin(), output_edges.end(),
+ [&](const EdgeID &edge_id)
+ {
+ // Skip check on current input edge
+ if (edge_id == input_edge_id)
+ {
+ return true;
+ }
+
+ auto edge = g.edge(edge_id);
+ return edge->tensor() != input_tensor;
+ });
}
// If do in-place calculation, then need to use the new output and inherit original output's accessor
@@ -109,12 +109,14 @@ void try_in_place_depthwiseconv(std::unique_ptr<INode> &node)
// Extract PadStrideInfo and depth multiplier
PadStrideInfo conv_info{};
unsigned int depth_multiplier{};
- if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer)
+ if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer)
{
- conv_info = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info();
- depth_multiplier = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier();
+ conv_info =
+ polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info();
+ depth_multiplier =
+ polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier();
}
- else if(node->type() == NodeType::DepthwiseConvolutionLayer)
+ else if (node->type() == NodeType::DepthwiseConvolutionLayer)
{
conv_info = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->convolution_info();
depth_multiplier = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->depth_multiplier();
@@ -126,7 +128,8 @@ void try_in_place_depthwiseconv(std::unique_ptr<INode> &node)
const auto out_shape = current_output_tensor->desc().shape;
const auto qinfo_out = current_output_tensor->desc().quant_info;
- bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr);
+ bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) &&
+ (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr);
// Specify conditions with which input can be in-placed
input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC;
@@ -141,13 +144,14 @@ void try_in_place_depthwiseconv(std::unique_ptr<INode> &node)
input_can_in_place &= !conv_info.has_padding();
// NOTE: Dilation should also be (1, 1). However currently dilation is not supported in the depthwise conv node
- if(input_can_in_place)
+ if (input_can_in_place)
{
set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor);
}
else
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor "
+ "or the quantization info are different.\n");
}
}
@@ -170,7 +174,7 @@ void try_in_place_elementwise(std::unique_ptr<INode> &node)
const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
// Inputs are not broadcast compatible
- if(out_shape.total_size() == 0)
+ if (out_shape.total_size() == 0)
{
return;
}
@@ -181,22 +185,27 @@ void try_in_place_elementwise(std::unique_ptr<INode> &node)
const auto qinfo_out = current_output_tensor->desc().quant_info;
// Can do in place, if the input has same shape as output, has same quntisation info as output, has same data type as output and input doesn't have accessor.
- bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out)
- && (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input0_tensor->accessor() == nullptr);
- bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out)
- && (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input1_tensor->accessor() == nullptr);
-
- if(input0_can_in_place)
+ bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) &&
+ (qinfo0 == qinfo_out) &&
+ (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) &&
+ (input0_tensor->accessor() == nullptr);
+ bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) &&
+ (qinfo1 == qinfo_out) &&
+ (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) &&
+ (input1_tensor->accessor() == nullptr);
+
+ if (input0_can_in_place)
{
set_new_output_and_inherit_accessor(node, current_output_tensor, input0_tensor);
}
- else if(input1_can_in_place)
+ else if (input1_can_in_place)
{
set_new_output_and_inherit_accessor(node, current_output_tensor, input1_tensor);
}
else
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor "
+ "or the quantization info are different.\n");
}
}
} // namespace
@@ -213,33 +222,31 @@ IGraphMutator::MutationType InPlaceOperationMutator::type() const
void InPlaceOperationMutator::mutate(Graph &g)
{
- std::set<NodeType> in_place_nodes =
- {
- NodeType::ActivationLayer,
- NodeType::BatchNormalizationLayer,
- NodeType::EltwiseLayer,
- NodeType::UnaryEltwiseLayer,
- NodeType::DepthwiseConvolutionLayer,
- NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer,
- NodeType::PrintLayer
- };
+ std::set<NodeType> in_place_nodes = {NodeType::ActivationLayer,
+ NodeType::BatchNormalizationLayer,
+ NodeType::EltwiseLayer,
+ NodeType::UnaryEltwiseLayer,
+ NodeType::DepthwiseConvolutionLayer,
+ NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer,
+ NodeType::PrintLayer};
// Not interested in the order of nodes
- for(auto &node : g.nodes())
+ for (auto &node : g.nodes())
{
- if(node && in_place_nodes.find(node->type()) != std::end(in_place_nodes))
+ if (node && in_place_nodes.find(node->type()) != std::end(in_place_nodes))
{
// Get input edge
Edge *input_edge = node->input_edge(0);
// Check if parent has a single output if yes then force in place calculation else not
- if((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge))
+ if ((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge))
{
- if(node->type() == NodeType::EltwiseLayer)
+ if (node->type() == NodeType::EltwiseLayer)
{
try_in_place_elementwise(node);
}
- else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer)
+ else if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer ||
+ node->type() == NodeType::DepthwiseConvolutionLayer)
{
try_in_place_depthwiseconv(node);
}
@@ -252,9 +259,11 @@ void InPlaceOperationMutator::mutate(Graph &g)
ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr);
// Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different
- if(new_output_tensor->accessor() != nullptr || current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info)
+ if (new_output_tensor->accessor() != nullptr ||
+ current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to "
+ "the input tensor or the quantization info are different.\n");
}
else
{
diff --git a/src/graph/mutators/MutatorUtils.cpp b/src/graph/mutators/MutatorUtils.cpp
index c8f38f34e7..f47240eadd 100644
--- a/src/graph/mutators/MutatorUtils.cpp
+++ b/src/graph/mutators/MutatorUtils.cpp
@@ -29,14 +29,14 @@ namespace graph
{
bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &padding_list)
{
- if(layout == DataLayout::NCHW || layout == DataLayout::NHWC)
+ if (layout == DataLayout::NCHW || layout == DataLayout::NHWC)
{
const unsigned int height_index = get_dimension_idx(layout, DataLayoutDimension::HEIGHT);
const unsigned int width_index = get_dimension_idx(layout, DataLayoutDimension::WIDTH);
- for(unsigned int i = 0; i < padding_list.size(); ++i)
+ for (unsigned int i = 0; i < padding_list.size(); ++i)
{
- if(i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0))
+ if (i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0))
{
// if the index is not either height or width, don't fuse
return false;
@@ -49,4 +49,4 @@ bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &
return false;
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
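The larger rewraps in the mutator files above all follow one pattern: a space after control-statement keywords, long calls broken with arguments aligned under the opening parenthesis, braced initializers without inner padding, and lambda bodies given braces on their own lines. A short, self-contained sketch written in that style, assuming a toy function that is not taken from the repository:

// Illustrative only: a small example formatted per the conventions
// visible in the hunks above.
#include <algorithm>
#include <vector>

namespace example
{
bool all_positive(const std::vector<int> &values)
{
    if (values.empty())
    {
        return false;
    }
    return std::all_of(values.cbegin(), values.cend(),
                       [](int v)
                       {
                           // Negative or zero entries fail the check
                           return v > 0;
                       });
}
} // namespace example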
diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp
index 09a3cf50c0..588befecae 100644
--- a/src/graph/mutators/NodeExecutionMethodMutator.cpp
+++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp
@@ -23,11 +23,11 @@
*/
#include "arm_compute/graph/mutators/NodeExecutionMethodMutator.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
#include "support/Cast.h"
@@ -49,17 +49,17 @@ template <typename Setter>
void set_default_on_invalid_method(Graph &g, NodeType node_type, Setter &&setter)
{
const std::vector<NodeID> &node_ids = g.nodes(node_type);
- for(auto &node_id : node_ids)
+ for (auto &node_id : node_ids)
{
INode *node = g.node(node_id);
- if(node != nullptr)
+ if (node != nullptr)
{
// Validate node
backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target());
Status status = backend.validate_node(*node);
// Set default execution method in case of failure
- if(!bool(status))
+ if (!bool(status))
{
setter(node);
}
@@ -81,22 +81,26 @@ IGraphMutator::MutationType NodeExecutionMethodMutator::type() const
void NodeExecutionMethodMutator::mutate(Graph &g)
{
// Convolution Layer
- set_default_on_invalid_method(g, NodeType::ConvolutionLayer, [](INode * n)
- {
- ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : "
- << n->id() << " and Name: " << n->name() << std::endl);
- auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(n);
- casted_node->set_convolution_method(ConvolutionMethod::Default);
- });
+ set_default_on_invalid_method(g, NodeType::ConvolutionLayer,
+ [](INode *n)
+ {
+ ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : "
+ << n->id() << " and Name: " << n->name() << std::endl);
+ auto *casted_node =
+ arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(n);
+ casted_node->set_convolution_method(ConvolutionMethod::Default);
+ });
// Depthwise Convolution Layer
- set_default_on_invalid_method(g, NodeType::DepthwiseConvolutionLayer, [](INode * n)
- {
- ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : "
- << n->id() << " and Name: " << n->name() << std::endl);
- auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(n);
- casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default);
- });
+ set_default_on_invalid_method(
+ g, NodeType::DepthwiseConvolutionLayer,
+ [](INode *n)
+ {
+ ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : "
+ << n->id() << " and Name: " << n->name() << std::endl);
+ auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(n);
+ casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default);
+ });
}
} // namespace graph
} // namespace arm_compute
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 38284b93cf..998a4a05c7 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -24,15 +24,14 @@
#include "arm_compute/graph/mutators/NodeFusionMutator.h"
#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/GraphBuilder.h"
#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
#include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
#include "src/graph/mutators/MutatorUtils.h"
-
#include "support/Cast.h"
#include <list>
@@ -46,7 +45,7 @@ namespace detail
{
void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode *old_node, bool add_output_tensor)
{
- if(new_node == nullptr || old_node == nullptr)
+ if (new_node == nullptr || old_node == nullptr)
{
return;
}
@@ -55,7 +54,7 @@ void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode
std::vector<NodeIdxPair> last_driving_nodes = get_driving_nodes(*old_node);
// Extract last fusable node accessor if any
- if(old_node->output(0) == nullptr)
+ if (old_node->output(0) == nullptr)
{
return;
}
@@ -65,10 +64,10 @@ void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode
g.remove_node(old_node->id());
// Update fused node outputs
- for(auto &driving_node : last_driving_nodes)
+ for (auto &driving_node : last_driving_nodes)
{
g.add_connection(new_node->id(), 0, driving_node.node_id, driving_node.index);
- if(add_output_tensor)
+ if (add_output_tensor)
{
configure_tensor(new_node->output(0));
}
@@ -83,19 +82,21 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge
ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
- auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
+ auto *bn_node =
+ arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
// Not fusing if number of groups is greater than 1
- if(conv_node->num_groups() > 1)
+ if (conv_node->num_groups() > 1)
{
return;
}
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id()
- << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : "
+ << output_edge->producer_id() << " with BatchNormalization Layer node with ID : "
+ << output_edge->consumer_id() << std::endl);
// Prevent fusion if fused node has an output accessor
- if(conv_node->output(0)->accessor() == nullptr)
+ if (conv_node->output(0)->accessor() == nullptr)
{
const Target assigned_target = conv_node->assigned_target();
@@ -115,9 +116,10 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge
const auto epsilon = bn_node->epsilon();
// Create the fused node
- const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info);
+ const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(
+ epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info);
- if(conv_node->input_edge(2) != nullptr)
+ if (conv_node->input_edge(2) != nullptr)
{
auto conv_bias_id = conv_node->input_edge(2)->producer_id();
g.add_connection(conv_bias_id, 0, fused_id, 2);
@@ -129,13 +131,13 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge
g.add_connection(bn_mean_id, 0, fused_id, 3);
g.add_connection(bn_var_id, 0, fused_id, 4);
- if(bn_node->input_edge(3) != nullptr)
+ if (bn_node->input_edge(3) != nullptr)
{
const auto bn_beta_id = bn_node->input_edge(3)->producer_id();
g.add_connection(bn_beta_id, 0, fused_id, 5);
}
- if(bn_node->input_edge(4) != nullptr)
+ if (bn_node->input_edge(4) != nullptr)
{
const auto bn_gamma_id = bn_node->input_edge(4)->producer_id();
g.add_connection(bn_gamma_id, 0, fused_id, 6);
@@ -147,14 +149,15 @@ void fuse_convolution_with_batch_normalization(Graph &g, const Edge *output_edge
transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true);
fused_node->set_assigned_target(assigned_target);
- fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target });
+ fused_node->set_common_node_parameters(NodeParams{conv_node->name() + "+" + bn_node_name, assigned_target});
// Remove convolution node
g.remove_node(conv_node->id());
}
else
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n");
}
}
@@ -162,14 +165,17 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o
{
ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
- auto *depth_conv_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(output_edge->producer());
- auto *bn_node = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
+ auto *depth_conv_node =
+ arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(output_edge->producer());
+ auto *bn_node =
+ arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " << output_edge->producer_id()
- << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : "
+ << output_edge->producer_id() << " with BatchNormalization Layer node with ID : "
+ << output_edge->consumer_id() << std::endl);
// Prevent fusion if fused node has an output accessor
- if(depth_conv_node->output(0)->accessor() == nullptr)
+ if (depth_conv_node->output(0)->accessor() == nullptr)
{
const Target assigned_target = depth_conv_node->assigned_target();
@@ -189,9 +195,10 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o
const auto epsilon = bn_node->epsilon();
// Create the fused node
- const NodeID fused_id = g.add_node<FusedDepthwiseConvolutionBatchNormalizationNode>(epsilon, conv_info, depth_multiplier, depth_conv_method, act_info);
+ const NodeID fused_id = g.add_node<FusedDepthwiseConvolutionBatchNormalizationNode>(
+ epsilon, conv_info, depth_multiplier, depth_conv_method, act_info);
- if(depth_conv_node->input_edge(2) != nullptr)
+ if (depth_conv_node->input_edge(2) != nullptr)
{
const auto conv_bias_id = depth_conv_node->input_edge(2)->producer_id();
g.add_connection(conv_bias_id, 0, fused_id, 2);
@@ -211,19 +218,23 @@ void fuse_depthwise_convolution_with_batch_normalization(Graph &g, const Edge *o
transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true);
fused_node->set_assigned_target(assigned_target);
- fused_node->set_common_node_parameters(NodeParams{ depth_conv_node->name() + "+" + bn_node_name, assigned_target });
+ fused_node->set_common_node_parameters(
+ NodeParams{depth_conv_node->name() + "+" + bn_node_name, assigned_target});
// Remove convolution node
g.remove_node(depth_conv_node->id());
}
else
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the presence of an output accessor\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the "
+ "presence of an output accessor\n");
}
}
template <typename N>
-void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set<Activation> &supported_fused_activations)
+void fuse_node_with_activation(Graph &g,
+ const Edge *output_edge,
+ const std::set<Activation> &supported_fused_activations)
{
ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
@@ -233,22 +244,23 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set
ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
// Check if activation is supported for fusion
- if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
+ if (supported_fused_activations.count(act_node->activation_info().activation()) == 0)
{
return;
}
// EltwiseLayerNode can only be fused when dataype is float
- if(n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type))
+ if (n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type))
{
return;
}
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
- << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
+ << " with Activation Layer node with ID : "
+ << output_edge->consumer_id() << std::endl);
// Prevent fusion if fused node has an output accessor
- if(n_node->output(0)->accessor() == nullptr)
+ if (n_node->output(0)->accessor() == nullptr)
{
// Set activation info to fused node
n_node->set_fused_activation(act_node->activation_info());
@@ -257,7 +269,8 @@ void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set
}
else
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Prevented fusion of node with activation due to the presence of an output accessor\n");
}
}
@@ -268,8 +281,8 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge)
auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->consumer());
const Edge *input_edge = pad_node->input_edge(0);
- if(input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr
- && pad_node->pad_value().get<float>() == 0.0)
+ if (input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr &&
+ pad_node->pad_value().get<float>() == 0.0)
{
const DataLayout layout = input_edge->tensor()->desc().layout;
const PaddingList padding_list = pad_node->padding();
@@ -280,18 +293,14 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge)
const PaddingInfo pad_w = width_index < padding_list.size() ? padding_list[width_index] : PaddingInfo(0, 0);
const PaddingInfo pad_h = height_index < padding_list.size() ? padding_list[height_index] : PaddingInfo(0, 0);
- if(is_padding_in_height_or_width(layout, padding_list))
+ if (is_padding_in_height_or_width(layout, padding_list))
{
// Add paddings to the convolution node
const PadStrideInfo conv_info = conv_node->convolution_info();
- const PadStrideInfo new_conv_info(
- conv_info.stride().first,
- conv_info.stride().second,
- conv_info.pad_left() + pad_w.first,
- conv_info.pad_right() + pad_w.second,
- conv_info.pad_top() + pad_h.first,
- conv_info.pad_bottom() + pad_h.second,
- conv_info.round());
+ const PadStrideInfo new_conv_info(conv_info.stride().first, conv_info.stride().second,
+ conv_info.pad_left() + pad_w.first, conv_info.pad_right() + pad_w.second,
+ conv_info.pad_top() + pad_h.first, conv_info.pad_bottom() + pad_h.second,
+ conv_info.round());
conv_node->set_convolution_info(new_conv_info);
// Update drivers of the convolution node
@@ -299,7 +308,7 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge)
g.remove_node(pad_node->id());
// Update fused node inputs
- for(auto &driver_node : pad_driver_nodes)
+ for (auto &driver_node : pad_driver_nodes)
{
g.add_connection(driver_node.node_id, driver_node.index, conv_node->id(), 0);
}
@@ -308,22 +317,23 @@ void fuse_pad_with_convolution(Graph &g, const Edge *output_edge)
}
template <typename N1, typename N2, typename F, typename... Args>
-void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments)
{
// Note that fused nodes may be added to the end of the node list.
// Instead of only looping over the original list of nodes, we loop over the current node list which could be growing.
// This is intentional as it probes the newly added fused nodes for further fusing opportunities.
- for(unsigned int i = 0; i < g.nodes().size(); ++i)
+ for (unsigned int i = 0; i < g.nodes().size(); ++i)
{
auto node = g.node(i);
// Check if the node is of type N1 and not a branching node
- if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
+ if (node && node->type() == N1::node_type && node->output_edges().size() == 1)
{
const auto output_edge_id = *node->output_edges().begin();
const auto output_edge = g.edge(output_edge_id);
// Check if following node is a type N2 node
- if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer()))
+ if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) &&
+ (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer()))
{
fuse_fcn(g, output_edge, optional_arguments...);
}
@@ -332,22 +342,22 @@ void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse
}
template <typename N1, typename F, typename... Args>
-void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments)
{
// Note that fused nodes may be added to the end of the node list.
// Instead of only looping over the original list of nodes, we loop over the current node list which could be growing.
// This is intentional as it probes the newly added fused nodes for further fusing opportunities.
- for(unsigned int i = 0; i < g.nodes().size(); ++i)
+ for (unsigned int i = 0; i < g.nodes().size(); ++i)
{
auto node = g.node(i);
// Check if the node is of type N1 and not a branching node
- if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
+ if (node && node->type() == N1::node_type && node->output_edges().size() == 1)
{
const auto output_edge_id = *node->output_edges().begin();
const auto output_edge = g.edge(output_edge_id);
// Check if it's the correct target
- if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer()))
+ if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer()))
{
fuse_fcn(g, output_edge, i, optional_arguments...);
}
@@ -369,30 +379,24 @@ IGraphMutator::MutationType NodeFusionMutator::type() const
void NodeFusionMutator::mutate(Graph &g)
{
// Supported activations when fusing
- const std::set<Activation> supported_fused_activations = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
- Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
- Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
- Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
- Activation::SQUARE, Activation::TANH
- };
+ const std::set<Activation> supported_fused_activations = {
+ Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
+ Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
+ Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
+ Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
+ Activation::SQUARE, Activation::TANH};
// Preconditions
- auto empty_prec = [](INode &)
- {
- return true;
- };
- auto cl_target_prec = [](INode & n)
- {
- return n.assigned_target() == Target::CL;
- };
- auto qs8_prec = [&g](INode & n)
+ auto empty_prec = [](INode &) { return true; };
+ auto cl_target_prec = [](INode &n) { return n.assigned_target() == Target::CL; };
+ auto qs8_prec = [&g](INode &n)
{
ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
const auto output_edge_id = *n.output_edges().begin();
const auto output_edge = g.edge(output_edge_id);
// To perform fusion the two nodes must have same output quantization information
- const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
+ const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8;
return (output_qasymm8 && same_qinfo) || !output_qasymm8;
@@ -400,16 +404,25 @@ void NodeFusionMutator::mutate(Graph &g)
// Fusion mutations
- detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
- detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
- detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
- detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
- detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
- detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
- detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
+ detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec,
+ detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
+ detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(
+ g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
+ detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(
+ g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
+ detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(
+ g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
+ detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(
+ g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
+ detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(
+ g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
+ detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(
+ g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
// The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer. Because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with activation, if any
- detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
- detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
+ detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(
+ g, empty_prec, detail::fuse_convolution_with_batch_normalization);
+ detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(
+ g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
}
} // namespace graph
} // namespace arm_compute
diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
index 2c28a1a2d1..533f8944cf 100644
--- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp
+++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
@@ -23,12 +23,12 @@
*/
#include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/algorithms/TopologicalSort.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
#include "arm_compute/graph/nodes/SplitLayerNode.h"
+#include "arm_compute/graph/Utils.h"
#include "support/Cast.h"
#include "support/Iterable.h"
@@ -50,7 +50,7 @@ IGraphMutator::MutationType SplitLayerSubTensorMutator::type() const
void SplitLayerSubTensorMutator::mutate(Graph &g)
{
// Early exit if no Split layers exist in graph
- if(g.nodes(NodeType::SplitLayer).empty())
+ if (g.nodes(NodeType::SplitLayer).empty())
{
return;
}
@@ -59,23 +59,23 @@ void SplitLayerSubTensorMutator::mutate(Graph &g)
std::vector<NodeID> topological_sorted_node_ids = dfs(g);
// Should be in reverse order of execution
- for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
+ for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
{
INode *node = g.node(node_id);
- if(node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr)
+ if (node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr)
{
// Get output tensor
Tensor *input_tensor = node->input(0);
// Check that all tensor have the same target and are valid
bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(),
- [&](const TensorID & tid)
- {
- return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target);
- });
+ [&](const TensorID &tid) {
+ return (g.tensor(tid) != nullptr) &&
+ (g.tensor(tid)->desc().target == input_tensor->desc().target);
+ });
// Create subtensors
- if(is_valid && is_target_supported(input_tensor->desc().target))
+ if (is_valid && is_target_supported(input_tensor->desc().target))
{
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
<< node->id() << " and name : " << node->name() << std::endl);
@@ -87,15 +87,18 @@ void SplitLayerSubTensorMutator::mutate(Graph &g)
const bool extend_parent = (axis < 2);
// Create sub-tensor handles
- for(unsigned int i = 0; i < node->outputs().size(); ++i)
+ for (unsigned int i = 0; i < node->outputs().size(); ++i)
{
Tensor *output_tensor = node->output(i);
const TensorShape output_shape = output_tensor->desc().shape;
Coordinates coords;
- std::tie(std::ignore, coords) = split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
+ std::tie(std::ignore, coords) =
+ split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
- backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(output_tensor->desc().target);
- std::unique_ptr<ITensorHandle> handle = backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
+ backends::IDeviceBackend &backend =
+ backends::BackendRegistry::get().get_backend(output_tensor->desc().target);
+ std::unique_ptr<ITensorHandle> handle =
+ backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
output_tensor->set_handle(std::move(handle));
}
}
diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp
index 74d040b81d..3dc2480e85 100644
--- a/src/graph/mutators/SyntheticDataTypeMutator.cpp
+++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp
@@ -26,8 +26,8 @@
#include "arm_compute/graph/GraphBuilder.h"
#include "arm_compute/graph/ITensorAccessor.h"
#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
#include "support/Cast.h"
@@ -62,14 +62,12 @@ public:
*/
bool is_mutation_supported(Graph &g)
{
- const std::set<NodeType> unsupported_node_types = { NodeType::DetectionOutputLayer,
- NodeType::NormalizationLayer,
- NodeType::PriorBoxLayer
- };
+ const std::set<NodeType> unsupported_node_types = {NodeType::DetectionOutputLayer, NodeType::NormalizationLayer,
+ NodeType::PriorBoxLayer};
- for(const auto &utype : unsupported_node_types)
+ for (const auto &utype : unsupported_node_types)
{
- if(!g.nodes(utype).empty())
+ if (!g.nodes(utype).empty())
{
return false;
}
@@ -83,12 +81,12 @@ bool is_mutation_supported(Graph &g)
*/
void remove_optimized_nodes(Graph &g)
{
- const std::set<NodeType> optimized_node_types = { NodeType::BatchNormalizationLayer };
+ const std::set<NodeType> optimized_node_types = {NodeType::BatchNormalizationLayer};
- for(const auto &opt_type : optimized_node_types)
+ for (const auto &opt_type : optimized_node_types)
{
const std::vector<NodeID> opt_nodes_ids = g.nodes(opt_type);
- for(const auto &node_id : opt_nodes_ids)
+ for (const auto &node_id : opt_nodes_ids)
{
INode *node = g.node(node_id);
@@ -108,7 +106,7 @@ void remove_optimized_nodes(Graph &g)
g.remove_node(node->id());
// Update connections
- for(auto &driving_node : driving_nodes)
+ for (auto &driving_node : driving_nodes)
{
g.add_connection(producer->id(), producer_edge_id, driving_node.node_id, driving_node.index);
}
@@ -123,11 +121,11 @@ void remove_optimized_nodes(Graph &g)
void convert_tensors(Graph &g, DataType data_type)
{
auto &tensors = g.tensors();
- for(auto &tensor : tensors)
+ for (auto &tensor : tensors)
{
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -156,7 +154,7 @@ template <typename NT>
void convert_special_node(Graph &g, std::function<bool(INode *, Tensor *)> const &f)
{
const std::vector<NodeID> nodes_ids = g.nodes(NT::node_type);
- for(const auto &nodes_id : nodes_ids)
+ for (const auto &nodes_id : nodes_ids)
{
INode *node = arm_compute::utils::cast::polymorphic_downcast<NT *>(g.node(nodes_id));
ARM_COMPUTE_ERROR_ON(node == nullptr);
@@ -174,41 +172,41 @@ void convert_special_node(Graph &g, std::function<bool(INode *, Tensor *)> const
*/
void convert_special_tensors(Graph &g)
{
- auto softmax_func = [](INode * node, Tensor * tensor)
+ auto softmax_func = [](INode *node, Tensor *tensor)
{
ARM_COMPUTE_UNUSED(node);
- if(tensor->desc().data_type == DataType::QASYMM8)
+ if (tensor->desc().data_type == DataType::QASYMM8)
{
tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0);
}
- else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED)
+ else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED)
{
tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128);
}
return true;
};
- auto act_func = [](INode * node, Tensor * tensor)
+ auto act_func = [](INode *node, Tensor *tensor)
{
auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(node);
- if(tensor->desc().data_type == DataType::QASYMM8)
+ if (tensor->desc().data_type == DataType::QASYMM8)
{
- if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
+ if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
{
tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 128);
}
- else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0);
}
}
- else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED)
+ else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED)
{
- if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
+ if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
{
tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 0);
}
- else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
{
tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128);
}
@@ -228,22 +226,19 @@ void convert_special_tensors(Graph &g)
*/
void handle_nodes_with_bias(Graph &g)
{
- const std::set<NodeType> special_node_types = { NodeType::ConvolutionLayer,
- NodeType::DeconvolutionLayer,
- NodeType::DepthwiseConvolutionLayer,
- NodeType::FullyConnectedLayer
- };
+ const std::set<NodeType> special_node_types = {NodeType::ConvolutionLayer, NodeType::DeconvolutionLayer,
+ NodeType::DepthwiseConvolutionLayer, NodeType::FullyConnectedLayer};
- for(const auto &spc_type : special_node_types)
+ for (const auto &spc_type : special_node_types)
{
const std::vector<NodeID> scp_nodes_ids = g.nodes(spc_type);
- for(const auto &node_id : scp_nodes_ids)
+ for (const auto &node_id : scp_nodes_ids)
{
INode *node = g.node(node_id);
- if(node != nullptr)
+ if (node != nullptr)
{
Tensor *tensor = node->input(2);
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
tensor->desc().data_type = DataType::S32;
}
@@ -253,8 +248,8 @@ void handle_nodes_with_bias(Graph &g)
params.name = params.name.empty() ? "" : params.name + "Bias";
TensorDescriptor b_desc = node->input(1)->desc();
- auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)];
- b_desc.shape = TensorShape(depth);
+ auto depth = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)];
+ b_desc.shape = TensorShape(depth);
auto accessor = std::make_unique<EmptyAccessor>();
auto b_nid = GraphBuilder::add_const_node(g, params, b_desc, std::move(accessor));
@@ -266,8 +261,7 @@ void handle_nodes_with_bias(Graph &g)
}
} // namespace
-SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type)
- : _mutate_type{ mutate_type }
+SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) : _mutate_type{mutate_type}
{
}
@@ -283,7 +277,7 @@ IGraphMutator::MutationType SyntheticDataTypeMutator::type() const
void SyntheticDataTypeMutator::mutate(Graph &g)
{
- if(is_mutation_supported(g))
+ if (is_mutation_supported(g))
{
// Remove nodes that get optimized out (e.g. BatchNorm)
remove_optimized_nodes(g);
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
index cf65d83a5e..1773afcb16 100644
--- a/src/graph/nodes/ActivationLayerNode.cpp
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -44,7 +44,7 @@ ActivationLayerInfo ActivationLayerNode::activation_info() const
bool ActivationLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -63,7 +63,7 @@ TensorDescriptor ActivationLayerNode::configure_output(size_t idx) const
ARM_COMPUTE_ERROR_ON(src == nullptr);
TensorDescriptor output_info = src->desc();
- if(!_out_quant_info.empty())
+ if (!_out_quant_info.empty())
{
output_info.quant_info = _out_quant_info;
}
diff --git a/src/graph/nodes/ArgMinMaxLayerNode.cpp b/src/graph/nodes/ArgMinMaxLayerNode.cpp
index 63163b9e2c..5adebc950a 100644
--- a/src/graph/nodes/ArgMinMaxLayerNode.cpp
+++ b/src/graph/nodes/ArgMinMaxLayerNode.cpp
@@ -23,16 +23,18 @@
*/
#include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
namespace arm_compute
{
namespace graph
{
-ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, unsigned int axis, DataType out_data_type, QuantizationInfo out_quant_info)
+ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op,
+ unsigned int axis,
+ DataType out_data_type,
+ QuantizationInfo out_quant_info)
: _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info))
{
_input_edges.resize(1, EmptyEdgeID);
@@ -56,7 +58,7 @@ DataType ArgMinMaxLayerNode::out_data_type() const
bool ArgMinMaxLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -75,17 +77,18 @@ TensorDescriptor ArgMinMaxLayerNode::configure_output(size_t idx) const
ARM_COMPUTE_ERROR_ON(src == nullptr);
TensorDescriptor output_info = src->desc();
- if(!_out_quant_info.empty())
+ if (!_out_quant_info.empty())
{
output_info.quant_info = _out_quant_info;
}
- if(_out_data_type != DataType::UNKNOWN)
+ if (_out_data_type != DataType::UNKNOWN)
{
output_info.data_type = _out_data_type;
}
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false);
+ TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false);
output_info.set_shape(output_shape);
return output_info;
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
index ceca0e2715..c317123e8d 100644
--- a/src/graph/nodes/BatchNormalizationLayerNode.cpp
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -55,7 +55,7 @@ void BatchNormalizationLayerNode::set_fused_activation(ActivationLayerInfo fused
bool BatchNormalizationLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -86,4 +86,4 @@ void BatchNormalizationLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
index f3f4f91075..8e52174639 100644
--- a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
+++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
@@ -23,17 +23,15 @@
*/
#include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/core/Helpers.h"
-
namespace arm_compute
{
namespace graph
{
-BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info)
- : _bbox_info(info)
+BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info) : _bbox_info(info)
{
_input_edges.resize(2, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -46,7 +44,7 @@ const BoundingBoxTransformInfo &BoundingBoxTransformLayerNode::info() const
bool BoundingBoxTransformLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/ChannelShuffleLayerNode.cpp b/src/graph/nodes/ChannelShuffleLayerNode.cpp
index 5102e4b6da..3cb9e23eca 100644
--- a/src/graph/nodes/ChannelShuffleLayerNode.cpp
+++ b/src/graph/nodes/ChannelShuffleLayerNode.cpp
@@ -30,8 +30,7 @@ namespace arm_compute
{
namespace graph
{
-ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups)
- : _num_groups(num_groups)
+ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups) : _num_groups(num_groups)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -44,7 +43,7 @@ unsigned int ChannelShuffleLayerNode::num_groups() const
bool ChannelShuffleLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -75,4 +74,4 @@ void ChannelShuffleLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp
index 3f3c70f3bb..8e5393a5e4 100644
--- a/src/graph/nodes/ConcatenateLayerNode.cpp
+++ b/src/graph/nodes/ConcatenateLayerNode.cpp
@@ -24,17 +24,17 @@
#include "arm_compute/graph/nodes/ConcatenateLayerNode.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
namespace arm_compute
{
namespace graph
{
-ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, descriptors::ConcatLayerDescriptor concat_descriptor)
+ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes,
+ descriptors::ConcatLayerDescriptor concat_descriptor)
: _total_nodes(total_nodes), _concat_descriptor(std::move(concat_descriptor)), _is_enabled(true)
{
_input_edges.resize(_total_nodes, EmptyEdgeID);
@@ -73,7 +73,7 @@ TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vect
// Extract shapes
std::vector<const TensorShape *> shapes;
shapes.reserve(input_descriptors.size());
- for(auto &input_descriptor : input_descriptors)
+ for (auto &input_descriptor : input_descriptors)
{
shapes.emplace_back(&input_descriptor.shape);
}
@@ -85,7 +85,7 @@ TensorDescriptor ConcatenateLayerNode::compute_output_descriptor(const std::vect
bool ConcatenateLayerNode::forward_descriptors()
{
- if(_outputs[0] != NullTensorID)
+ if (_outputs[0] != NullTensorID)
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -101,24 +101,22 @@ TensorDescriptor ConcatenateLayerNode::configure_output(size_t idx) const
ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
// Check if all input tensors are set
- bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
- {
- return eid != EmptyEdgeID;
- });
+ bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges),
+ [](const EdgeID &eid) { return eid != EmptyEdgeID; });
TensorDescriptor output_info = {};
- if(are_all_inputs_set)
+ if (are_all_inputs_set)
{
std::vector<TensorDescriptor> inputs_descriptors;
- for(unsigned int i = 0; i < _input_edges.size(); ++i)
+ for (unsigned int i = 0; i < _input_edges.size(); ++i)
{
const Tensor *t = _graph->tensor(input_id(i));
ARM_COMPUTE_ERROR_ON(t == nullptr);
inputs_descriptors.push_back(t->desc());
}
output_info = compute_output_descriptor(inputs_descriptors, _concat_descriptor.axis);
- if(!_concat_descriptor.output_qinfo.empty())
+ if (!_concat_descriptor.output_qinfo.empty())
{
output_info.quant_info = _concat_descriptor.output_qinfo;
}
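
Note on the hunk above: the reformatting does not change the concatenation rule itself. As a hedged, stand-alone sketch (hypothetical names, not the library's ShapeCalculator API), the output keeps every input dimension except the concatenation axis, whose sizes are summed:

    #include <cstddef>
    #include <vector>

    // Assumes a non-empty 'shapes' vector whose entries agree on every axis except 'axis'.
    std::vector<std::size_t> concat_shape(const std::vector<std::vector<std::size_t>> &shapes, std::size_t axis)
    {
        std::vector<std::size_t> out = shapes.front();
        out[axis] = 0;
        for (const auto &s : shapes)
        {
            out[axis] += s[axis]; // sum the concatenated axis, keep the rest
        }
        return out;
    }
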
diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp
index eb96d63888..6e8fbff71a 100644
--- a/src/graph/nodes/ConstNode.cpp
+++ b/src/graph/nodes/ConstNode.cpp
@@ -30,15 +30,14 @@ namespace arm_compute
{
namespace graph
{
-ConstNode::ConstNode(TensorDescriptor desc)
- : _desc(std::move(desc))
+ConstNode::ConstNode(TensorDescriptor desc) : _desc(std::move(desc))
{
_outputs.resize(1, NullTensorID);
}
bool ConstNode::forward_descriptors()
{
- if(output_id(0) != NullTensorID)
+ if (output_id(0) != NullTensorID)
{
Tensor *t = output(0);
ARM_COMPUTE_ERROR_ON(t == nullptr);
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index ee9dde91d5..f0263fc84a 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -37,7 +37,12 @@ ConvolutionLayerNode::ConvolutionLayerNode(PadStrideInfo info,
ConvolutionMethod method,
FastMathHint fast_math_hint,
QuantizationInfo out_quant_info)
- : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation()
+ : _info(std::move(info)),
+ _num_groups(num_groups),
+ _method(method),
+ _fast_math_hint(fast_math_hint),
+ _out_quant_info(std::move(out_quant_info)),
+ _fused_activation()
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -100,20 +105,22 @@ TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDes
const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+ weights_descriptor.shape[3]);
return output_descriptor;
}
bool ConvolutionLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -132,7 +139,7 @@ TensorDescriptor ConvolutionLayerNode::configure_output(size_t idx) const
ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
- if(!_out_quant_info.empty())
+ if (!_out_quant_info.empty())
{
output_info.quant_info = _out_quant_info;
}
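
The wrapped call to scaled_dimensions() above follows the usual convolution output-size arithmetic. A minimal sketch, assuming symmetric padding and floor rounding (the library helper additionally takes its padding and rounding options from PadStrideInfo; names here are illustrative only):

    #include <utility>

    // Floor-rounded output size for a convolution with symmetric padding.
    std::pair<unsigned, unsigned> conv_output_dims(unsigned in_w, unsigned in_h,
                                                   unsigned k_w, unsigned k_h,
                                                   unsigned stride_x, unsigned stride_y,
                                                   unsigned pad_x, unsigned pad_y)
    {
        const unsigned out_w = (in_w + 2 * pad_x - k_w) / stride_x + 1;
        const unsigned out_h = (in_h + 2 * pad_y - k_h) / stride_y + 1;
        return {out_w, out_h};
    }
    // e.g. a 224x224 input, 3x3 kernel, stride 2, padding 1 -> 112x112
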
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index 3542d5ad10..2058ab21e5 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -56,20 +56,22 @@ TensorDescriptor DeconvolutionLayerNode::compute_output_descriptor(const TensorD
const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
- std::tie(output_width, output_height) = deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ std::tie(output_width, output_height) =
+ deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info);
const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+ weights_descriptor.shape[3]);
return output_descriptor;
}
bool DeconvolutionLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -89,7 +91,7 @@ TensorDescriptor DeconvolutionLayerNode::configure_output(size_t idx) const
TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), descriptor.info);
- if(!descriptor.out_quant_info.empty())
+ if (!descriptor.out_quant_info.empty())
{
output_info.set_quantization_info(descriptor.out_quant_info);
}
diff --git a/src/graph/nodes/DepthToSpaceLayerNode.cpp b/src/graph/nodes/DepthToSpaceLayerNode.cpp
index b70ac56a07..0b914a0e56 100644
--- a/src/graph/nodes/DepthToSpaceLayerNode.cpp
+++ b/src/graph/nodes/DepthToSpaceLayerNode.cpp
@@ -32,8 +32,7 @@ namespace arm_compute
{
namespace graph
{
-DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape)
- : _block_shape(block_shape)
+DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) : _block_shape(block_shape)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -44,7 +43,8 @@ int DepthToSpaceLayerNode::block_shape() const
return _block_shape;
}
-TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape)
+TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ int block_shape)
{
using namespace arm_compute::helpers::tensor_transform;
@@ -53,14 +53,15 @@ TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDe
// Set descriptor shape
TensorDescriptor output_descriptor = input_descriptor;
- output_descriptor.shape = misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape);
+ output_descriptor.shape =
+ misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape);
return output_descriptor;
}
bool DepthToSpaceLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 7de20165cb..92d7266088 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -32,9 +32,15 @@ namespace arm_compute
{
namespace graph
{
-DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method,
- QuantizationInfo out_quant_info)
- : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _out_quant_info(std::move(out_quant_info)), _fused_activation()
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info,
+ int depth_multiplier,
+ DepthwiseConvolutionMethod method,
+ QuantizationInfo out_quant_info)
+ : _info(std::move(info)),
+ _depth_multiplier(depth_multiplier),
+ _method(method),
+ _out_quant_info(std::move(out_quant_info)),
+ _fused_activation()
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -89,20 +95,22 @@ TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const
const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+ input_channels * depth_multiplier);
return output_descriptor;
}
bool DepthwiseConvolutionLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -121,7 +129,7 @@ TensorDescriptor DepthwiseConvolutionLayerNode::configure_output(size_t idx) con
ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
- if(!_out_quant_info.empty())
+ if (!_out_quant_info.empty())
{
output_info.quant_info = _out_quant_info;
}
@@ -139,4 +147,4 @@ void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
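
The channel computation rewrapped above encodes the depthwise rule: with depth multiplier M, C input channels yield C * M output channels, while the spatial sizes follow the same scaled_dimensions arithmetic sketched after the ConvolutionLayerNode diff. A trivial illustration (hypothetical helper name):

    // Depthwise convolutions multiply the channel count by the depth multiplier.
    unsigned depthwise_output_channels(unsigned input_channels, unsigned depth_multiplier)
    {
        return input_channels * depth_multiplier; // e.g. 32 channels, multiplier 2 -> 64
    }
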
diff --git a/src/graph/nodes/DequantizationLayerNode.cpp b/src/graph/nodes/DequantizationLayerNode.cpp
index 14c4752f12..3ea000852a 100644
--- a/src/graph/nodes/DequantizationLayerNode.cpp
+++ b/src/graph/nodes/DequantizationLayerNode.cpp
@@ -40,7 +40,7 @@ DequantizationLayerNode::DequantizationLayerNode()
bool DequantizationLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -74,4 +74,4 @@ void DequantizationLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp
index fc6f531ee0..65ddd2f5bc 100644
--- a/src/graph/nodes/DetectionOutputLayerNode.cpp
+++ b/src/graph/nodes/DetectionOutputLayerNode.cpp
@@ -32,8 +32,7 @@ namespace arm_compute
{
namespace graph
{
-DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info)
- : _info(detection_info)
+DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info) : _info(detection_info)
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -47,7 +46,8 @@ DetectionOutputLayerInfo DetectionOutputLayerNode::detection_output_info() const
TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const DetectionOutputLayerInfo &info)
{
- const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
+ const unsigned int max_size =
+ info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(0, detection_size);
@@ -58,7 +58,8 @@ TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const Tenso
bool DetectionOutputLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) &&
+ (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/DetectionPostProcessLayerNode.cpp b/src/graph/nodes/DetectionPostProcessLayerNode.cpp
index 2c5005af30..af3fc03d67 100644
--- a/src/graph/nodes/DetectionPostProcessLayerNode.cpp
+++ b/src/graph/nodes/DetectionPostProcessLayerNode.cpp
@@ -46,10 +46,11 @@ DetectionPostProcessLayerInfo DetectionPostProcessLayerNode::detection_post_proc
bool DetectionPostProcessLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID)
- && (output_id(2) != NullTensorID) && (output_id(3) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) &&
+ (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID) &&
+ (output_id(3) != NullTensorID))
{
- for(unsigned int i = 0; i < 4; ++i)
+ for (unsigned int i = 0; i < 4; ++i)
{
Tensor *dst = output(i);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -68,7 +69,7 @@ TensorDescriptor DetectionPostProcessLayerNode::configure_output(size_t idx) con
TensorDescriptor output_desc;
const unsigned int num_detected_box = _info.max_detections() * _info.max_classes_per_detection();
- switch(idx)
+ switch (idx)
{
case 0:
// Configure boxes output
@@ -101,4 +102,4 @@ void DetectionPostProcessLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/DummyNode.cpp b/src/graph/nodes/DummyNode.cpp
index 6fa9fbaf56..b5f37bd79b 100644
--- a/src/graph/nodes/DummyNode.cpp
+++ b/src/graph/nodes/DummyNode.cpp
@@ -32,8 +32,7 @@ namespace arm_compute
{
namespace graph
{
-DummyNode::DummyNode(TensorShape shape)
- : _shape(shape)
+DummyNode::DummyNode(TensorShape shape) : _shape(shape)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -41,7 +40,7 @@ DummyNode::DummyNode(TensorShape shape)
bool DummyNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -75,4 +74,4 @@ void DummyNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp
index 4426e953ee..3f7a08e64d 100644
--- a/src/graph/nodes/EltwiseLayerNode.cpp
+++ b/src/graph/nodes/EltwiseLayerNode.cpp
@@ -31,8 +31,7 @@ namespace arm_compute
{
namespace graph
{
-EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor)
- : descriptor(descriptor)
+EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor) : descriptor(descriptor)
{
_input_edges.resize(2, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -70,7 +69,7 @@ void EltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activation
bool EltwiseLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -97,7 +96,7 @@ TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const
output_info.set_shape(out_shape);
- if(!descriptor.out_quant_info.empty())
+ if (!descriptor.out_quant_info.empty())
{
output_info.set_quantization_info(descriptor.out_quant_info);
}
@@ -134,7 +133,7 @@ void UnaryEltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activ
bool UnaryEltwiseLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -153,7 +152,7 @@ TensorDescriptor UnaryEltwiseLayerNode::configure_output(size_t idx) const
auto output_info = src->desc();
- if(!descriptor.out_quant_info.empty())
+ if (!descriptor.out_quant_info.empty())
{
output_info.set_quantization_info(descriptor.out_quant_info);
}
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
index 48519a1695..952df2f3ec 100644
--- a/src/graph/nodes/FlattenLayerNode.cpp
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -38,7 +38,7 @@ FlattenLayerNode::FlattenLayerNode()
bool FlattenLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -72,4 +72,4 @@ void FlattenLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 6278227878..1eed69ddaf 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -21,18 +21,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
-
#include "arm_compute/core/Utils.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
namespace arm_compute
{
namespace graph
{
-FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info, FullyConnectedLayerInfo fc_info, FastMathHint fast_math_hint)
- : _num_outputs(num_outputs), _out_quant_info(std::move(out_quant_info)), _info(fc_info), _fast_math_hint(fast_math_hint)
+FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs,
+ QuantizationInfo out_quant_info,
+ FullyConnectedLayerInfo fc_info,
+ FastMathHint fast_math_hint)
+ : _num_outputs(num_outputs),
+ _out_quant_info(std::move(out_quant_info)),
+ _info(fc_info),
+ _fast_math_hint(fast_math_hint)
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -60,11 +65,11 @@ TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const Tenso
unsigned int num_weights = 1;
unsigned int num_dimensions = input_descriptor.shape.num_dimensions();
// Ignore the batch dimension if there is one:
- if(num_dimensions == 2 || num_dimensions == 4)
+ if (num_dimensions == 2 || num_dimensions == 4)
{
num_dimensions--;
}
- for(unsigned int i = 0; i < num_dimensions; i++)
+ for (unsigned int i = 0; i < num_dimensions; i++)
{
num_weights *= input_descriptor.shape[i];
}
@@ -73,13 +78,13 @@ TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const Tenso
weights_descriptor.shape = TensorShape(num_weights, num_outputs);
// If weights are tranposed, use tranposed shape
- if(!fc_info.transpose_weights)
+ if (!fc_info.transpose_weights)
{
weights_descriptor.shape = TensorShape(num_outputs, num_weights);
}
// Set quantization info if present
- if(!weights_quant_info.empty())
+ if (!weights_quant_info.empty())
{
weights_descriptor.quant_info = weights_quant_info;
}
@@ -93,7 +98,7 @@ TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const Tensor
{
// Note: Only 1D batch space is supported at the moment
unsigned int batches = input_descriptor.shape[1];
- if(input_descriptor.shape.num_dimensions() > 2)
+ if (input_descriptor.shape.num_dimensions() > 2)
{
batches = input_descriptor.shape[3];
}
@@ -103,7 +108,7 @@ TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const Tensor
output_descriptor.shape = TensorShape(num_outputs, batches);
// Set quantization info if present
- if(!out_quant_info.empty())
+ if (!out_quant_info.empty())
{
output_descriptor.quant_info = out_quant_info;
}
@@ -118,7 +123,7 @@ FullyConnectedLayerInfo FullyConnectedLayerNode::info() const
bool FullyConnectedLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -147,4 +152,4 @@ void FullyConnectedLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
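
The weight-shape logic reformatted above can be summarised as: drop the batch dimension, multiply the remaining input dimensions together, and flip the 2D weight shape when the weights are not stored transposed. A stand-alone sketch with hypothetical types, mirroring the hunk rather than the library API:

    #include <cstddef>
    #include <utility>
    #include <vector>

    std::pair<std::size_t, std::size_t> fc_weights_shape(const std::vector<std::size_t> &input_shape,
                                                         std::size_t num_outputs, bool transpose_weights)
    {
        std::size_t num_dims = input_shape.size();
        if (num_dims == 2 || num_dims == 4) // ignore the batch dimension if there is one
        {
            --num_dims;
        }
        std::size_t num_weights = 1;
        for (std::size_t i = 0; i < num_dims; ++i)
        {
            num_weights *= input_shape[i];
        }
        // (num_weights, num_outputs) when stored transposed, (num_outputs, num_weights) otherwise
        return transpose_weights ? std::make_pair(num_weights, num_outputs)
                                 : std::make_pair(num_outputs, num_weights);
    }
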
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
index de995ebee9..9d37e84acf 100644
--- a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
+++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
@@ -32,12 +32,18 @@ namespace arm_compute
{
namespace graph
{
-FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info,
- unsigned int num_groups,
- ConvolutionMethod method,
- FastMathHint fast_math_hint,
+FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon,
+ PadStrideInfo info,
+ unsigned int num_groups,
+ ConvolutionMethod method,
+ FastMathHint fast_math_hint,
ActivationLayerInfo fused_activation)
- : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _fused_activation(fused_activation)
+ : _epsilon(epsilon),
+ _info(std::move(info)),
+ _num_groups(num_groups),
+ _method(method),
+ _fast_math_hint(fast_math_hint),
+ _fused_activation(fused_activation)
{
_input_edges.resize(7, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -88,9 +94,8 @@ void FusedConvolutionBatchNormalizationNode::set_fused_activation(ActivationLaye
_fused_activation = fused_activation;
}
-TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info)
+TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(
+ const TensorDescriptor &input_descriptor, const TensorDescriptor &weights_descriptor, const PadStrideInfo &info)
{
unsigned int output_width = 0;
unsigned int output_height = 0;
@@ -100,20 +105,22 @@ TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descript
const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+ weights_descriptor.shape[3]);
return output_descriptor;
}
bool FusedConvolutionBatchNormalizationNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
index c022450b9d..c51641d64c 100644
--- a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
+++ b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
@@ -32,18 +32,24 @@ namespace arm_compute
{
namespace graph
{
-FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode(float epsilon,
- PadStrideInfo info,
- unsigned int depth_multiplier,
- DepthwiseConvolutionMethod method,
- ActivationLayerInfo fused_activation)
- : _epsilon(epsilon), _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation(fused_activation)
+FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode(
+ float epsilon,
+ PadStrideInfo info,
+ unsigned int depth_multiplier,
+ DepthwiseConvolutionMethod method,
+ ActivationLayerInfo fused_activation)
+ : _epsilon(epsilon),
+ _info(std::move(info)),
+ _depth_multiplier(depth_multiplier),
+ _method(method),
+ _fused_activation(fused_activation)
{
_input_edges.resize(7, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
}
-void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method)
+void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method(
+ DepthwiseConvolutionMethod method)
{
_method = method;
}
@@ -78,10 +84,11 @@ void FusedDepthwiseConvolutionBatchNormalizationNode::set_fused_activation(Activ
_fused_activation = fused_activation;
}
-TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const TensorDescriptor &weights_descriptor,
- const PadStrideInfo &info,
- int depth_multiplier)
+TensorDescriptor
+FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const TensorDescriptor &weights_descriptor,
+ const PadStrideInfo &info,
+ int depth_multiplier)
{
unsigned int output_width = 0;
unsigned int output_height = 0;
@@ -92,19 +99,22 @@ TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output
const unsigned int kernel_width = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::WIDTH), output_width);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT), output_height);
- output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT),
+ output_height);
+ output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL),
+ input_channels * depth_multiplier);
return output_descriptor;
}
bool FusedDepthwiseConvolutionBatchNormalizationNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
index 9f36862818..1671a47a95 100644
--- a/src/graph/nodes/GenerateProposalsLayerNode.cpp
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -23,17 +23,15 @@
*/
#include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/core/Helpers.h"
-
namespace arm_compute
{
namespace graph
{
-GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info)
- : _info(info)
+GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info) : _info(info)
{
_input_edges.resize(3, EmptyEdgeID);
_outputs.resize(3, NullTensorID);
@@ -46,10 +44,10 @@ const GenerateProposalsInfo &GenerateProposalsLayerNode::info() const
bool GenerateProposalsLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID)
- && (output_id(2) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) &&
+ (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID))
{
- for(unsigned int i = 0; i < 3; ++i)
+ for (unsigned int i = 0; i < 3; ++i)
{
Tensor *dst = output(i);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -68,7 +66,7 @@ TensorDescriptor GenerateProposalsLayerNode::configure_output(size_t idx) const
ARM_COMPUTE_ERROR_ON(src == nullptr);
TensorDescriptor output_desc = src->desc();
- switch(idx)
+ switch (idx)
{
case 0:
// Configure proposals output
diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp
index 072281f259..7408bc265d 100644
--- a/src/graph/nodes/InputNode.cpp
+++ b/src/graph/nodes/InputNode.cpp
@@ -30,15 +30,14 @@ namespace arm_compute
{
namespace graph
{
-InputNode::InputNode(TensorDescriptor desc)
- : _desc(std::move(desc))
+InputNode::InputNode(TensorDescriptor desc) : _desc(std::move(desc))
{
_outputs.resize(1, NullTensorID);
}
bool InputNode::forward_descriptors()
{
- if(output_id(0) != NullTensorID)
+ if (output_id(0) != NullTensorID)
{
Tensor *t = output(0);
ARM_COMPUTE_ERROR_ON(t == nullptr);
diff --git a/src/graph/nodes/L2NormalizeLayerNode.cpp b/src/graph/nodes/L2NormalizeLayerNode.cpp
index 0c35a335fa..1a57cf0199 100644
--- a/src/graph/nodes/L2NormalizeLayerNode.cpp
+++ b/src/graph/nodes/L2NormalizeLayerNode.cpp
@@ -30,18 +30,15 @@ namespace arm_compute
{
namespace graph
{
-L2NormalizeLayerNode::L2NormalizeLayerNode()
- : L2NormalizeLayerNode(0, 1e-12f)
+L2NormalizeLayerNode::L2NormalizeLayerNode() : L2NormalizeLayerNode(0, 1e-12f)
{
}
-L2NormalizeLayerNode::L2NormalizeLayerNode(int axis)
- : L2NormalizeLayerNode(axis, 1e-12f)
+L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) : L2NormalizeLayerNode(axis, 1e-12f)
{
}
-L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon)
- : _axis(axis), _epsilon(epsilon)
+L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) : _axis(axis), _epsilon(epsilon)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -49,7 +46,7 @@ L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon)
bool L2NormalizeLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -92,4 +89,4 @@ void L2NormalizeLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp
index eaa1bcf924..b18bb7dd93 100644
--- a/src/graph/nodes/NormalizationLayerNode.cpp
+++ b/src/graph/nodes/NormalizationLayerNode.cpp
@@ -31,8 +31,7 @@ namespace arm_compute
{
namespace graph
{
-NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info)
- : _info(norm_info)
+NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info) : _info(norm_info)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -45,7 +44,7 @@ NormalizationLayerInfo NormalizationLayerNode::normalization_info() const
bool NormalizationLayerNode::forward_descriptors()
{
- if(input_id(0) != NullTensorID && (output_id(0) != NullTensorID))
+ if (input_id(0) != NullTensorID && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -76,4 +75,4 @@ void NormalizationLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
index 113d0a541f..cac96606ea 100644
--- a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
+++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
@@ -39,7 +39,7 @@ NormalizePlanarYUVLayerNode::NormalizePlanarYUVLayerNode()
bool NormalizePlanarYUVLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/PReluLayerNode.cpp b/src/graph/nodes/PReluLayerNode.cpp
index 378c18e3bb..2b50fe9234 100644
--- a/src/graph/nodes/PReluLayerNode.cpp
+++ b/src/graph/nodes/PReluLayerNode.cpp
@@ -38,7 +38,7 @@ PReluLayerNode::PReluLayerNode()
bool PReluLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp
index 6424370d41..336e7de05a 100644
--- a/src/graph/nodes/PadLayerNode.cpp
+++ b/src/graph/nodes/PadLayerNode.cpp
@@ -23,17 +23,15 @@
*/
#include "arm_compute/graph/nodes/PadLayerNode.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/core/Helpers.h"
-
namespace arm_compute
{
namespace graph
{
-PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value)
- : _padding(padding), _pad_value(pad_value)
+PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value) : _padding(padding), _pad_value(pad_value)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -51,7 +49,7 @@ PixelValue PadLayerNode::pad_value() const
bool PadLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -71,7 +69,7 @@ TensorDescriptor PadLayerNode::configure_output(size_t idx) const
TensorDescriptor output_desc = src->desc();
const TensorShape input_shape = src->desc().shape;
- for(size_t dim = 0; dim < _padding.size(); ++dim)
+ for (size_t dim = 0; dim < _padding.size(); ++dim)
{
output_desc.shape.set(dim, _padding[dim].first + input_shape[dim] + _padding[dim].second);
}
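
The loop rewrapped above grows each padded dimension by its before/after padding. A minimal sketch of that rule, with illustrative names only:

    #include <cstddef>
    #include <utility>
    #include <vector>

    std::vector<std::size_t> padded_shape(std::vector<std::size_t> shape,
                                          const std::vector<std::pair<std::size_t, std::size_t>> &padding)
    {
        for (std::size_t dim = 0; dim < padding.size() && dim < shape.size(); ++dim)
        {
            // new size = padding before + original size + padding after
            shape[dim] = padding[dim].first + shape[dim] + padding[dim].second;
        }
        return shape;
    }
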
diff --git a/src/graph/nodes/PermuteLayerNode.cpp b/src/graph/nodes/PermuteLayerNode.cpp
index b311ee1301..db53722363 100644
--- a/src/graph/nodes/PermuteLayerNode.cpp
+++ b/src/graph/nodes/PermuteLayerNode.cpp
@@ -23,17 +23,15 @@
*/
#include "arm_compute/graph/nodes/PermuteLayerNode.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/core/Helpers.h"
-
namespace arm_compute
{
namespace graph
{
-PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout)
- : _perm(perm), _layout(layout)
+PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout) : _perm(perm), _layout(layout)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -46,7 +44,7 @@ const PermutationVector &PermuteLayerNode::permutation_vector() const
bool PermuteLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -66,7 +64,7 @@ TensorDescriptor PermuteLayerNode::configure_output(size_t idx) const
TensorDescriptor output_desc = src->desc();
permute(output_desc.shape, _perm);
- if(_layout != DataLayout::UNKNOWN)
+ if (_layout != DataLayout::UNKNOWN)
{
output_desc.layout = _layout;
}
@@ -84,4 +82,4 @@ void PermuteLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
index 4ecf924a5e..ac954acbe3 100644
--- a/src/graph/nodes/PoolingLayerNode.cpp
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -32,8 +32,7 @@ namespace arm_compute
{
namespace graph
{
-PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info)
- : _info(std::move(pool_info))
+PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info) : _info(std::move(pool_info))
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -55,7 +54,8 @@ TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescrip
const unsigned int pool_size_x = info.is_global_pooling ? input_width : info.pool_size.width;
const unsigned int pool_size_y = info.is_global_pooling ? input_height : info.pool_size.height;
- std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info);
+ std::tie(pooled_width, pooled_height) =
+ scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info);
const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
@@ -67,7 +67,7 @@ TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescrip
bool PoolingLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -98,4 +98,4 @@ void PoolingLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/PrintLayerNode.cpp b/src/graph/nodes/PrintLayerNode.cpp
index da408d8c4d..82a340005b 100644
--- a/src/graph/nodes/PrintLayerNode.cpp
+++ b/src/graph/nodes/PrintLayerNode.cpp
@@ -32,7 +32,9 @@ namespace arm_compute
{
namespace graph
{
-PrintLayerNode::PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info, const std::function<ITensor *(ITensor *)> transform)
+PrintLayerNode::PrintLayerNode(std::ostream &stream,
+ const IOFormatInfo &format_info,
+ const std::function<ITensor *(ITensor *)> transform)
: _stream(stream), _format_info(format_info), _transform(transform)
{
_input_edges.resize(1, EmptyEdgeID);
@@ -56,7 +58,7 @@ const std::function<ITensor *(ITensor *)> PrintLayerNode::transform() const
bool PrintLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -88,4 +90,4 @@ void PrintLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp
index f017ead880..5ffb173333 100644
--- a/src/graph/nodes/PriorBoxLayerNode.cpp
+++ b/src/graph/nodes/PriorBoxLayerNode.cpp
@@ -32,8 +32,7 @@ namespace arm_compute
{
namespace graph
{
-PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info)
- : _info(std::move(prior_info))
+PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info) : _info(std::move(prior_info))
{
_input_edges.resize(2, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -44,7 +43,7 @@ PriorBoxLayerInfo PriorBoxLayerNode::priorbox_info() const
return _info;
}
-TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
const PriorBoxLayerInfo &info)
{
const unsigned int layer_width = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
@@ -61,7 +60,7 @@ TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescri
bool PriorBoxLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/QuantizationLayerNode.cpp b/src/graph/nodes/QuantizationLayerNode.cpp
index 4906808dae..0dd2da919d 100644
--- a/src/graph/nodes/QuantizationLayerNode.cpp
+++ b/src/graph/nodes/QuantizationLayerNode.cpp
@@ -47,7 +47,7 @@ QuantizationLayerNode::QuantizationLayerNode(QuantizationInfo out_quant_info, Da
bool QuantizationLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp
index 62891811f3..5909335826 100644
--- a/src/graph/nodes/ROIAlignLayerNode.cpp
+++ b/src/graph/nodes/ROIAlignLayerNode.cpp
@@ -24,17 +24,15 @@
#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
+#include "arm_compute/core/Helpers.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
-#include "arm_compute/core/Helpers.h"
-
namespace arm_compute
{
namespace graph
{
-ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info)
- : _pool_info(pool_info)
+ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info) : _pool_info(pool_info)
{
_input_edges.resize(2, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -47,7 +45,7 @@ const ROIPoolingLayerInfo &ROIAlignLayerNode::pooling_info() const
bool ROIAlignLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -92,4 +90,4 @@ void ROIAlignLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ReductionLayerNode.cpp b/src/graph/nodes/ReductionLayerNode.cpp
index 0e93039894..965c1ba0a5 100644
--- a/src/graph/nodes/ReductionLayerNode.cpp
+++ b/src/graph/nodes/ReductionLayerNode.cpp
@@ -56,7 +56,7 @@ bool ReductionLayerNode::keep_dims() const
bool ReductionLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -74,8 +74,9 @@ TensorDescriptor ReductionLayerNode::configure_output(size_t idx) const
const Tensor *src = input(0);
ARM_COMPUTE_ERROR_ON(src == nullptr);
- TensorDescriptor output_info = src->desc();
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims);
+ TensorDescriptor output_info = src->desc();
+ TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims);
output_info.set_shape(output_shape);
return output_info;
@@ -91,4 +92,4 @@ void ReductionLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
index e693e4b931..251a4ea1b2 100644
--- a/src/graph/nodes/ReorgLayerNode.cpp
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -31,8 +31,7 @@ namespace arm_compute
{
namespace graph
{
-ReorgLayerNode::ReorgLayerNode(int stride)
- : _stride(stride)
+ReorgLayerNode::ReorgLayerNode(int stride) : _stride(stride)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -51,20 +50,22 @@ TensorDescriptor ReorgLayerNode::compute_output_descriptor(const TensorDescripto
ARM_COMPUTE_ERROR_ON(stride <= 0);
ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0),
+ "The height of the input tensor must be a multiple of stride");
const DataLayout data_layout = input_descriptor.layout;
TensorDescriptor output_descriptor = input_descriptor;
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width / stride);
output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height / stride);
- output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
+ output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+ input_channel * stride * stride);
return output_descriptor;
}
bool ReorgLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -95,4 +96,4 @@ void ReorgLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
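
The shape updates rewrapped above make the reorg rule explicit: spatial dimensions shrink by the stride while the channel count grows by stride squared, so the element count is preserved. A hedged, stand-alone sketch (hypothetical names):

    #include <cassert>

    struct ReorgShape
    {
        unsigned width, height, channels;
    };

    ReorgShape reorg_output_shape(unsigned w, unsigned h, unsigned c, unsigned stride)
    {
        assert(stride > 0 && w % stride == 0 && h % stride == 0); // same preconditions as the node
        return {w / stride, h / stride, c * stride * stride};
    }
    // e.g. reorg_output_shape(26, 26, 64, 2) -> {13, 13, 256}
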
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
index a6354d03ed..ce6bf9b803 100644
--- a/src/graph/nodes/ReshapeLayer.cpp
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -21,17 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
-
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
namespace arm_compute
{
namespace graph
{
-ReshapeLayerNode::ReshapeLayerNode(TensorShape shape)
- : _shape(shape)
+ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) : _shape(shape)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -39,7 +37,7 @@ ReshapeLayerNode::ReshapeLayerNode(TensorShape shape)
bool ReshapeLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -73,4 +71,4 @@ void ReshapeLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp
index 2a94bf6063..292b2c643e 100644
--- a/src/graph/nodes/ResizeLayerNode.cpp
+++ b/src/graph/nodes/ResizeLayerNode.cpp
@@ -50,7 +50,7 @@ std::pair<float, float> ResizeLayerNode::scaling_factor() const
bool ResizeLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -88,4 +88,4 @@ void ResizeLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
index b7655b9eae..eb877d9a24 100644
--- a/src/graph/nodes/SliceLayerNode.cpp
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -32,8 +32,7 @@ namespace arm_compute
{
namespace graph
{
-SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends)
- : _starts(starts), _ends(ends)
+SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends) : _starts(starts), _ends(ends)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -50,19 +49,20 @@ Coordinates SliceLayerNode::ends() const
}
TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const Coordinates &starts, const Coordinates &ends)
+ const Coordinates &starts,
+ const Coordinates &ends)
{
using namespace arm_compute::helpers::tensor_transform;
TensorDescriptor output_desc = input_descriptor;
- output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends);
+ output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends);
return output_desc;
}
bool SliceLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp
index 031166993a..4beac81b1f 100644
--- a/src/graph/nodes/SoftmaxLayerNode.cpp
+++ b/src/graph/nodes/SoftmaxLayerNode.cpp
@@ -31,8 +31,7 @@ namespace arm_compute
{
namespace graph
{
-SoftmaxLayerNode::SoftmaxLayerNode(float beta)
- : _beta(beta)
+SoftmaxLayerNode::SoftmaxLayerNode(float beta) : _beta(beta)
{
_input_edges.resize(1, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -45,7 +44,7 @@ float SoftmaxLayerNode::beta() const
bool SoftmaxLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -79,4 +78,4 @@ void SoftmaxLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp
index 31931c3a79..dfb6624f80 100644
--- a/src/graph/nodes/SplitLayerNode.cpp
+++ b/src/graph/nodes/SplitLayerNode.cpp
@@ -49,8 +49,8 @@ unsigned int SplitLayerNode::axis() const
return _axis;
}
-std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
- unsigned int num_splits, int axis, unsigned int idx)
+std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(
+ const TensorDescriptor &input_descriptor, unsigned int num_splits, int axis, unsigned int idx)
{
// Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
int num_dimension = static_cast<int32_t>(input_descriptor.shape.num_dimensions());
@@ -58,7 +58,7 @@ std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descript
Coordinates coords;
TensorDescriptor output_descriptor = input_descriptor;
int split_size = input_descriptor.shape[tmp_axis] / num_splits;
- if(_size_splits.empty())
+ if (_size_splits.empty())
{
output_descriptor.shape.set(tmp_axis, split_size);
coords.set(tmp_axis, idx * split_size);
@@ -66,15 +66,15 @@ std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descript
else
{
int split_size = _size_splits[idx];
- if(split_size == -1)
+ if (split_size == -1)
{
split_size = input_descriptor.shape[tmp_axis];
- for(unsigned int i = 0; i < _size_splits.size() - 1; ++i)
+ for (unsigned int i = 0; i < _size_splits.size() - 1; ++i)
split_size -= _size_splits[i];
}
output_descriptor.shape.set(tmp_axis, split_size);
int coord_value = 0;
- for(unsigned int i = 0; i < idx; ++i)
+ for (unsigned int i = 0; i < idx; ++i)
coord_value += _size_splits[i];
coords.set(tmp_axis, coord_value);
}
@@ -84,12 +84,12 @@ std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descript
bool SplitLayerNode::forward_descriptors()
{
- if(input_id(0) != NullTensorID)
+ if (input_id(0) != NullTensorID)
{
validate();
- for(unsigned int i = 0; i < _outputs.size(); ++i)
+ for (unsigned int i = 0; i < _outputs.size(); ++i)
{
- if(output_id(i) != NullTensorID)
+ if (output_id(i) != NullTensorID)
{
Tensor *dst_i = output(i);
ARM_COMPUTE_ERROR_ON(dst_i == nullptr);
@@ -117,10 +117,10 @@ TensorDescriptor SplitLayerNode::configure_output(size_t idx) const
int tmp_axis = wrap_around(_axis, num_dimension);
int split_size = (_size_splits.empty()) ? (input_descriptor.shape[tmp_axis] / _num_splits) : _size_splits[idx];
- if(split_size == -1)
+ if (split_size == -1)
{
split_size = input_descriptor.shape[tmp_axis];
- for(unsigned int i = 0; i < _size_splits.size() - 1; ++i)
+ for (unsigned int i = 0; i < _size_splits.size() - 1; ++i)
split_size -= _size_splits[i];
}
output_descriptor.shape.set(tmp_axis, split_size);
@@ -138,7 +138,7 @@ Status SplitLayerNode::validate() const
// Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
int tmp_axis = wrap_around(_axis, num_dimension);
- if(_size_splits.empty())
+ if (_size_splits.empty())
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[tmp_axis] % _num_splits, "Split should be exact");
}
@@ -156,4 +156,4 @@ void SplitLayerNode::accept(INodeVisitor &v)
v.visit(*this);
}
} // namespace graph
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
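
Illustration (editor's sketch, not part of the patch): the reformatted branches above resolve a size_splits entry of -1 to "whatever is left of the axis". A self-contained restatement of that arithmetic, using a hypothetical helper name and assuming a non-empty size_splits whose last entry is -1:

    #include <cstddef>
    #include <vector>

    // e.g. axis length 10, size_splits = {2, 3, -1}  ->  the last split resolves to 10 - (2 + 3) = 5.
    int resolve_last_split(int axis_length, const std::vector<int> &size_splits)
    {
        int split = axis_length;
        for (std::size_t i = 0; i < size_splits.size() - 1; ++i)
        {
            split -= size_splits[i]; // subtract every explicitly sized split
        }
        return split;
    }
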
diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp
index f292b33ad0..031d8fc739 100644
--- a/src/graph/nodes/StackLayerNode.cpp
+++ b/src/graph/nodes/StackLayerNode.cpp
@@ -25,18 +25,16 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/INodeVisitor.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
namespace arm_compute
{
namespace graph
{
-StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis)
- : _total_nodes(total_nodes), _axis(axis)
+StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis) : _total_nodes(total_nodes), _axis(axis)
{
_input_edges.resize(_total_nodes, EmptyEdgeID);
_outputs.resize(1, NullTensorID);
@@ -64,7 +62,7 @@ TensorDescriptor StackLayerNode::compute_output_descriptor(const std::vector<Ten
bool StackLayerNode::forward_descriptors()
{
- if(_outputs[0] != NullTensorID)
+ if (_outputs[0] != NullTensorID)
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -80,17 +78,15 @@ TensorDescriptor StackLayerNode::configure_output(size_t idx) const
ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
// Check if all input tensors are set
- bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
- {
- return eid != EmptyEdgeID;
- });
+ bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges),
+ [](const EdgeID &eid) { return eid != EmptyEdgeID; });
TensorDescriptor output_info = {};
- if(are_all_inputs_set)
+ if (are_all_inputs_set)
{
std::vector<TensorDescriptor> inputs_descriptors;
- for(unsigned int i = 0; i < _input_edges.size(); ++i)
+ for (unsigned int i = 0; i < _input_edges.size(); ++i)
{
const Tensor *t = _graph->tensor(input_id(i));
ARM_COMPUTE_ERROR_ON(t == nullptr);
diff --git a/src/graph/nodes/StridedSliceLayerNode.cpp b/src/graph/nodes/StridedSliceLayerNode.cpp
index 6a1a724bb3..fc9f72204c 100644
--- a/src/graph/nodes/StridedSliceLayerNode.cpp
+++ b/src/graph/nodes/StridedSliceLayerNode.cpp
@@ -79,7 +79,7 @@ TensorDescriptor StridedSliceLayerNode::compute_output_descriptor(const TensorDe
bool StridedSliceLayerNode::forward_descriptors()
{
- if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+ if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
{
Tensor *dst = output(0);
ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 9c7c4248bb..5587ed23f0 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/nodes/Nodes.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/TypePrinter.h"
-#include "arm_compute/graph/nodes/Nodes.h"
namespace arm_compute
{
@@ -152,9 +152,9 @@ void DotGraphPrinter::print_footer(const Graph &g, std::ostream &os)
void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os)
{
- for(const auto &n : g.nodes())
+ for (const auto &n : g.nodes())
{
- if(n)
+ if (n)
{
// Output node id
std::string node_id = std::string("n") + support::cpp11::to_string(n->id());
@@ -166,7 +166,8 @@ void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os)
std::string name = n->name().empty() ? node_id : n->name();
auto node_description = _dot_node_visitor.info();
- os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description << R"("])";
+ os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description
+ << R"("])";
os << ";\n";
}
}
@@ -174,16 +175,17 @@ void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os)
void DotGraphPrinter::print_edges(const Graph &g, std::ostream &os)
{
- for(const auto &e : g.edges())
+ for (const auto &e : g.edges())
{
- if(e)
+ if (e)
{
std::string source_node_id = std::string("n") + support::cpp11::to_string(e->producer_id());
std::string sink_node_id = std::string("n") + support::cpp11::to_string(e->consumer_id());
os << source_node_id << " -> " << sink_node_id << " ";
const Tensor *t = e->tensor();
ARM_COMPUTE_ERROR_ON(t == nullptr);
- os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" << t->desc().layout << R"("])";
+ os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )"
+ << t->desc().layout << R"("])";
os << ";\n";
}
}
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index ef7c62d64b..eca712dbf0 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -22,9 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/Allocator.h"
-#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/MemoryRegion.h"
#include <cstddef>
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index bea55d8eb9..8a0fc05c39 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -35,8 +35,7 @@
namespace arm_compute
{
-BlobLifetimeManager::BlobLifetimeManager()
- : _blobs()
+BlobLifetimeManager::BlobLifetimeManager() : _blobs()
{
}
@@ -62,33 +61,32 @@ void BlobLifetimeManager::update_blobs_and_mappings()
ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
// Sort free blobs requirements in descending order.
- _free_blobs.sort([](const Blob & ba, const Blob & bb)
- {
- return ba.max_size > bb.max_size;
- });
+ _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; });
// Create group sizes vector
std::vector<BlobInfo> group_sizes;
- std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
- {
- return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() };
- });
+ std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes),
+ [](const Blob &b) {
+ return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()};
+ });
// Update blob sizes
size_t max_size = std::max(_blobs.size(), group_sizes.size());
_blobs.resize(max_size);
group_sizes.resize(max_size);
- std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
- {
- return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) };
- });
+ std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs),
+ [](BlobInfo lhs, BlobInfo rhs)
+ {
+ return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment),
+ std::max(lhs.owners, rhs.owners)};
+ });
// Calculate group mappings
auto &group_mappings = _active_group->mappings();
int blob_idx = 0;
- for(auto &free_blob : _free_blobs)
+ for (auto &free_blob : _free_blobs)
{
- for(auto &bound_element_id : free_blob.bound_elements)
+ for (auto &bound_element_id : free_blob.bound_elements)
{
ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
Element &bound_element = _active_elements[bound_element_id];
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 88e280537c..a2f63ef52b 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -47,7 +47,7 @@ BlobMemoryPool::~BlobMemoryPool()
void BlobMemoryPool::acquire(MemoryMappings &handles)
{
// Set memory to handlers
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_region(_blobs[handle.second].get());
@@ -56,7 +56,7 @@ void BlobMemoryPool::acquire(MemoryMappings &handles)
void BlobMemoryPool::release(MemoryMappings &handles)
{
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_region(nullptr);
@@ -78,7 +78,7 @@ void BlobMemoryPool::allocate_blobs(const std::vector<BlobInfo> &blob_info)
{
ARM_COMPUTE_ERROR_ON(!_allocator);
- for(const auto &bi : blob_info)
+ for (const auto &bi : blob_info)
{
_blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
}
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index e06ef3d37d..b4545b93bf 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -35,7 +35,8 @@ namespace arm_compute
void *CLBufferAllocator::allocate(size_t size, size_t alignment)
{
ARM_COMPUTE_UNUSED(alignment);
- cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) };
+ cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size,
+ nullptr, nullptr)};
return static_cast<void *>(buf);
}
diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
index 7168259fcd..d680dc08bb 100644
--- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
+++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
@@ -27,8 +27,7 @@
namespace arm_compute
{
-CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle()
- : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
+CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
{
}
CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default;
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 5b4bbbcde0..eb28ecbf8d 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -50,34 +50,30 @@ void printf_callback(const char *buffer, unsigned int len, size_t complete, void
* @return A pointer to the context properties which can be used to create an opencl context
*/
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
+void initialise_context_properties(const cl::Platform &platform,
+ const cl::Device &device,
+ std::array<cl_context_properties, 7> &prop)
{
ARM_COMPUTE_UNUSED(device);
#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
// Query devices in the context for cl_arm_printf support
- if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+ if (arm_compute::device_supports_extension(device, "cl_arm_printf"))
{
// Create a cl_context with a printf_callback and user specified buffer size.
- std::array<cl_context_properties, 7> properties_printf =
- {
+ std::array<cl_context_properties, 7> properties_printf = {
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
// Enable a printf callback function for this context.
CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
// Request a minimum printf buffer size of 4MB for devices in the
// context that support this extension.
- CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
- 0
- };
+ CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0};
prop = properties_printf;
}
else
#endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
{
- std::array<cl_context_properties, 3> properties =
- {
- CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
- 0
- };
+ std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM,
+ reinterpret_cast<cl_context_properties>(platform()), 0};
std::copy(properties.begin(), properties.end(), prop.begin());
};
}
@@ -91,19 +87,19 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
cl::Platform::get(&platforms);
ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
- cl::Platform selected_platform{ nullptr };
+ cl::Platform selected_platform{nullptr};
// If the user has selected the Native platform, return the first available.
- switch(cl_backend_type)
+ switch (cl_backend_type)
{
case CLBackendType::Native:
selected_platform = platforms[0];
break;
case CLBackendType::Clvk:
- for(auto p : platforms)
+ for (auto p : platforms)
{
std::string res = p.getInfo<CL_PLATFORM_NAME>();
- if(res.find("clvk") != std::string::npos)
+ if (res.find("clvk") != std::string::npos)
{
selected_platform = p;
break;
@@ -114,7 +110,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
ARM_COMPUTE_ERROR("Unsupported backend type");
}
- if(!selected_platform())
+ if (!selected_platform())
{
ARM_COMPUTE_ERROR("No valid platform found");
}
@@ -122,8 +118,7 @@ cl::Platform select_preferable_platform(CLBackendType cl_backend_type)
return selected_platform;
}
-std::tuple<cl::Context, cl::Device, cl_int>
-create_opencl_context_and_device(CLBackendType cl_backend_type)
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type)
{
ARM_COMPUTE_ERROR_ON(!opencl_is_available());
cl::Platform p = select_preferable_platform(cl_backend_type);
@@ -131,9 +126,9 @@ create_opencl_context_and_device(CLBackendType cl_backend_type)
std::vector<cl::Device> platform_devices;
p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
- device = platform_devices[0];
- cl_int err = CL_SUCCESS;
- std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
+ device = platform_devices[0];
+ cl_int err = CL_SUCCESS;
+ std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0};
initialise_context_properties(p, device, properties);
cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
@@ -143,7 +138,7 @@ create_opencl_context_and_device(CLBackendType cl_backend_type)
void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
- if(ctx)
+ if (ctx)
{
ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
ctx->gpu_scheduler()->enqueue(*kernel, flush);
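
Illustration (editor's sketch, not part of the patch): with its signature collapsed onto one line, create_opencl_context_and_device() reads plainly as returning {context, device, error code}. A minimal caller, assuming the OpenCL loader is available at runtime (the helper asserts on opencl_is_available()):

    #include <tuple>

    #include "arm_compute/runtime/CL/CLHelpers.h"

    int main()
    {
        cl::Context ctx;
        cl::Device  dev;
        cl_int      err = CL_SUCCESS;
        // Declared above as std::tuple<cl::Context, cl::Device, cl_int>.
        std::tie(ctx, dev, err) = arm_compute::create_opencl_context_and_device(arm_compute::CLBackendType::Native);
        return (err == CL_SUCCESS) ? 0 : 1;
    }
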
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index a1743c56e6..c6ee6fde83 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -24,24 +24,22 @@
#include "arm_compute/runtime/CL/CLMemory.h"
#include "arm_compute/core/Error.h"
+
#include "support/Cast.h"
namespace arm_compute
{
-CLMemory::CLMemory()
- : _region(nullptr), _region_owned(nullptr)
+CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr)
{
}
-CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
- : _region(nullptr), _region_owned(memory)
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
}
-CLMemory::CLMemory(ICLMemoryRegion *memory)
- : _region(memory), _region_owned(nullptr)
+CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
{
_region = memory;
}
@@ -78,4 +76,4 @@ void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
_region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
_region = _region_owned.get();
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 00f91a0ffb..835958b816 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -29,10 +29,7 @@
namespace arm_compute
{
ICLMemoryRegion::ICLMemoryRegion(size_t size)
- : IMemoryRegion(size),
- _ctx(CLScheduler::get().context()),
- _mapping(nullptr),
- _mem()
+ : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem()
{
}
@@ -57,17 +54,15 @@ std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset,
return nullptr;
}
-CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size)
- : ICLMemoryRegion(size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size)
{
- if(_size != 0)
+ if (_size != 0)
{
_mem = cl::Buffer(CLScheduler::get().context(), flags, _size);
}
}
-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
- : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
{
_mem = buffer;
}
@@ -102,10 +97,10 @@ void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
: ICLMemoryRegion(size), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
_ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment);
- if(_ptr != nullptr)
+ if (_ptr != nullptr)
{
_mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
}
@@ -114,7 +109,7 @@ ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t a
ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
{
- if(_ptr != nullptr)
+ if (_ptr != nullptr)
{
try
{
@@ -125,7 +120,7 @@ ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
_mem = cl::Buffer();
clSVMFree(_ctx.get(), _ptr);
}
- catch(...)
+ catch (...)
{
}
}
@@ -144,7 +139,8 @@ CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size
void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
- clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+ clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr,
+ nullptr);
_mapping = _ptr;
return _mapping;
}
@@ -163,7 +159,7 @@ CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, si
void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
{
- if(blocking)
+ if (blocking)
{
clFinish(q.get());
}
diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp
index 075a544077..89d4520038 100644
--- a/src/runtime/CL/CLOperator.cpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -30,14 +30,13 @@ namespace arm_compute
{
namespace experimental
{
-ICLOperator::ICLOperator(IRuntimeContext *ctx)
- : _kernel(), _ctx(ctx), _workspace()
+ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
{
}
void ICLOperator::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 5083b4b0c5..b426b8c304 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
@@ -29,7 +30,10 @@
namespace arm_compute
{
CLRuntimeContext::CLRuntimeContext()
- : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type()
+ : _gpu_owned_scheduler(std::make_unique<CLScheduler>()),
+ _gpu_scheduler(_gpu_owned_scheduler.get()),
+ _symbols(),
+ _backend_type()
{
_symbols.load_default();
auto ctx_dev_err = create_opencl_context_and_device(_backend_type);
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index b7a4dff45d..f0a42f55fd 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+
#include "src/core/CL/ICLKernel.h"
namespace arm_compute
@@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event()
void CLScheduler::tune_kernel_static(ICLKernel &kernel)
{
- if(_cl_tuner != nullptr)
+ if (_cl_tuner != nullptr)
{
_cl_tuner->tune_kernel_static(kernel);
}
@@ -95,8 +96,16 @@ bool CLScheduler::is_initialised() const
std::once_flag CLScheduler::_initialize_symbols;
CLScheduler::CLScheduler()
- : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(true),
- _job_chaining_size(1), _job_chaining_count(0)
+ : _context(),
+ _queue(),
+ _target(GPUTarget::MIDGARD),
+ _is_initialised(false),
+ _cl_tuner(nullptr),
+ _gemm_heuristics(nullptr),
+ _backend_type(CLBackendType::Native),
+ _job_chaining_enabled(true),
+ _job_chaining_size(1),
+ _job_chaining_count(0)
{
}
@@ -107,9 +116,12 @@ CLScheduler &CLScheduler::get()
return scheduler;
}
-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h)
+void CLScheduler::default_init_with_context(cl::Device &device,
+ cl::Context &ctx,
+ ICLTuner *cl_tuner,
+ CLGEMMHeuristicsHandle *gemm_h)
{
- if(!_is_initialised)
+ if (!_is_initialised)
{
const std::string cl_kernels_folder("./cl_kernels/");
cl::CommandQueue queue = cl::CommandQueue(ctx, device);
@@ -121,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx
void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
{
- if(!_is_initialised)
+ if (!_is_initialised)
{
cl::Context ctx;
cl::Device dev;
@@ -151,7 +163,12 @@ void CLScheduler::set_context(cl::Context context)
CLKernelLibrary::get().set_context(_context);
}
-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+void CLScheduler::init(cl::Context context,
+ cl::CommandQueue queue,
+ const cl::Device &device,
+ ICLTuner *cl_tuner,
+ CLGEMMHeuristicsHandle *gemm_h,
+ CLBackendType cl_backend_type)
{
set_context(std::move(context));
_queue = std::move(queue);
@@ -164,21 +181,21 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De
void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
{
- ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
- "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+ ARM_COMPUTE_ERROR_ON_MSG(
+ !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
const bool inject_memory = !tensors.empty();
// Tune the kernel if the CLTuner has been provided
- if(_cl_tuner != nullptr)
+ if (_cl_tuner != nullptr)
{
inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
}
// Run kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
- if(_job_chaining_enabled)
+ if (_job_chaining_enabled)
{
++_job_chaining_count;
}
@@ -188,9 +205,9 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
void CLScheduler::flush_queue(bool flush)
{
- if(_job_chaining_enabled)
+ if (_job_chaining_enabled)
{
- if(_job_chaining_count >= _job_chaining_size)
+ if (_job_chaining_count >= _job_chaining_size)
{
_job_chaining_count = 0;
/*
@@ -199,14 +216,14 @@ void CLScheduler::flush_queue(bool flush)
the CPU activity for job-scheduling.
For eg. job-chain size goes from 1, 2, 4, 8 and 16
*/
- if(_job_chaining_size < 16)
+ if (_job_chaining_size < 16)
{
_job_chaining_size <<= 1;
}
_queue.flush();
}
}
- else if(flush)
+ else if (flush)
{
_queue.flush();
}
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 14936ae23c..ace820bbb7 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -29,12 +29,14 @@
using namespace arm_compute;
-CLSubTensor::CLSubTensor()
- : _parent(nullptr), _info()
+CLSubTensor::CLSubTensor() : _parent(nullptr), _info()
{
}
-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
+CLSubTensor::CLSubTensor(ICLTensor *parent,
+ const TensorShape &tensor_shape,
+ const Coordinates &coords,
+ bool extend_parent)
: _parent(nullptr), _info()
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
@@ -81,7 +83,7 @@ void CLSubTensor::unmap()
uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
{
ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
- if(_parent->buffer() == nullptr)
+ if (_parent->buffer() == nullptr)
{
_parent->map(q, blocking);
}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index f85b8ae777..e6457218c7 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -46,17 +46,16 @@ static IAllocator *static_global_cl_allocator = nullptr;
std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment)
{
// Try fine-grain SVM
- std::unique_ptr<ICLMemoryRegion> region = std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
- size,
- alignment);
+ std::unique_ptr<ICLMemoryRegion> region =
+ std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
// Try coarse-grain SVM in case of failure
- if(region != nullptr && region->ptr() == nullptr)
+ if (region != nullptr && region->ptr() == nullptr)
{
region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment);
}
// Try legacy buffer memory in case of failure
- if(region != nullptr && region->ptr() == nullptr)
+ if (region != nullptr && region->ptr() == nullptr)
{
region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
}
@@ -80,7 +79,10 @@ void clear_quantization_arrays(CLFloatArray &scale, CLInt32Array &offset)
* @param[in] qinfo Quantization info
* @param[in] pad_size Pad size to use in case array needs to be padded for computation purposes
*/
-void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size)
+void populate_quantization_info(CLFloatArray &scale,
+ CLInt32Array &offset,
+ const QuantizationInfo &qinfo,
+ size_t pad_size)
{
clear_quantization_arrays(scale, offset);
@@ -90,16 +92,18 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const
const size_t element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
scale = CLFloatArray(num_elements + pad_size);
scale.resize(num_elements);
- CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
+ CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size,
+ qinfo.scale().data());
- if(!qinfo.offset().empty())
+ if (!qinfo.offset().empty())
{
// Create offset array
- const std::vector<int32_t> &qoffset = qinfo.offset();
- const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
- offset = CLInt32Array(num_elements + pad_size);
+ const std::vector<int32_t> &qoffset = qinfo.offset();
+ const size_t offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
+ offset = CLInt32Array(num_elements + pad_size);
offset.resize(num_elements);
- CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data());
+ CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0,
+ num_elements * offset_element_size, qinfo.offset().data());
}
}
} // namespace
@@ -111,7 +115,7 @@ CLTensorAllocator::CLTensorAllocator(IMemoryManageable *owner, CLRuntimeContext
CLQuantization CLTensorAllocator::quantization() const
{
- return { &_scale, &_offset };
+ return {&_scale, &_offset};
}
uint8_t *CLTensorAllocator::data()
@@ -127,10 +131,10 @@ const cl::Buffer &CLTensorAllocator::cl_data() const
void CLTensorAllocator::allocate()
{
// Allocate tensor backing memory
- if(_associated_memory_group == nullptr)
+ if (_associated_memory_group == nullptr)
{
// Perform memory allocation
- if(static_global_cl_allocator != nullptr)
+ if (static_global_cl_allocator != nullptr)
{
_memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0));
}
@@ -146,7 +150,7 @@ void CLTensorAllocator::allocate()
}
// Allocate and fill the quantization parameter arrays
- if(is_data_type_quantized_per_channel(info().data_type()))
+ if (is_data_type_quantized_per_channel(info().data_type()))
{
const size_t pad_size = 0;
populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size);
@@ -193,7 +197,7 @@ void CLTensorAllocator::set_global_allocator(IAllocator *allocator)
uint8_t *CLTensorAllocator::lock()
{
- if(_ctx)
+ if (_ctx)
{
return map(_ctx->gpu_scheduler()->queue(), true);
}
@@ -206,7 +210,7 @@ uint8_t *CLTensorAllocator::lock()
void CLTensorAllocator::unlock()
{
ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
- if(_ctx)
+ if (_ctx)
{
unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 445638f01f..0d62fe3afe 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
@@ -37,19 +38,23 @@
namespace arm_compute
{
CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
- : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
+ : real_clEnqueueNDRangeKernel(nullptr),
+ _tuning_params_table(),
+ _lws_table(),
+ _kernel_event(),
+ _tune_new_kernels(tune_new_kernels),
+ _tuning_info(tuning_info)
{
}
struct CLTuner::IKernelData
{
- virtual ~IKernelData() = default;
+ virtual ~IKernelData() = default;
virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0;
};
struct DefaultKernelData : public CLTuner::IKernelData
{
- DefaultKernelData(ITensorPack &tensors)
- : _tensors{ tensors }
+ DefaultKernelData(ITensorPack &tensors) : _tensors{tensors}
{
}
~DefaultKernelData() override = default;
@@ -100,16 +105,17 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Get the configuration ID from the kernel and append GPU target name and number of available compute units
- const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
+ const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" +
+ support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
// Check if we need to find the Optimal LWS. If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned
- if(kernel.config_id() != arm_compute::default_config_id)
+ if (kernel.config_id() != arm_compute::default_config_id)
{
auto p = _tuning_params_table.find(config_id);
- if(p == _tuning_params_table.end())
+ if (p == _tuning_params_table.end())
{
- if(_tune_new_kernels)
+ if (_tune_new_kernels)
{
// Find the optimal LWS for the kernel
CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data);
@@ -119,7 +125,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
// Set Local-Workgroup-Size
kernel.set_lws_hint(opt_tuning_params.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
}
@@ -129,7 +135,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Set Local-Workgroup-Size
kernel.set_lws_hint(p->second.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
kernel.set_wbsm_hint(p->second.get_wbsm());
}
@@ -138,7 +144,7 @@ void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
}
void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
{
- DefaultKernelData data{ tensors };
+ DefaultKernelData data{tensors};
do_tune_kernel_dynamic(kernel, &data);
}
@@ -154,7 +160,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
cl::CommandQueue queue_profiler;
// Extract real OpenCL function to intercept
- if(real_clEnqueueNDRangeKernel == nullptr)
+ if (real_clEnqueueNDRangeKernel == nullptr)
{
real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
}
@@ -165,7 +171,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
// Check if we can use the OpenCL timer with the default queue
cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
- if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ if ((props & CL_QUEUE_PROFILING_ENABLE) == 0)
{
// Set the queue for profiling
queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
@@ -176,21 +182,23 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
}
// Start intercepting enqueues:
- auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event * event_wait_list, cl_event * event)
+ auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo,
+ const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event)
{
- if(this->kernel_event_is_set())
+ if (this->kernel_event_is_set())
{
// If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
return CL_SUCCESS;
}
cl_event tmp;
- cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+ cl_int retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws,
+ num_events_in_wait_list, event_wait_list, &tmp);
// Set OpenCL event
this->set_cl_kernel_event(tmp);
- if(event != nullptr)
+ if (event != nullptr)
{
//return cl_event from the intercepted call
clRetainEvent(tmp);
@@ -209,9 +217,10 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
/// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op()
/// Please see COMPMID-5934
cl::NDRange gws = kernel.get_cached_gws();
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
- "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search",
- kernel.config_id().c_str(), to_string(gws).c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(
+ arm_compute::logging::LogLevel::INFO,
+ "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(),
+ to_string(gws).c_str());
queue_profiler.finish();
@@ -224,7 +233,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
// Construct the list of tuning parameters values to be tested based on the tuner mode.
auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
- for(size_t i = 0; i < tuning_list->size(); ++i)
+ for (size_t i = 0; i < tuning_list->size(); ++i)
{
CLTuningParams tuning_test = (*tuning_list)[i];
// Setting the lws
@@ -234,19 +243,18 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
auto z = lws_test[2];
const bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
- if(invalid_lws)
+ if (invalid_lws)
{
continue;
}
kernel.set_lws_hint(lws_test);
- if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+ if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
{
cl_int wbsm_test = tuning_test.get_wbsm();
kernel.set_wbsm_hint(wbsm_test);
}
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
- "[CLTuner] Trying LWS: %s, WBSM: %d",
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d",
to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint());
// Run the kernel
@@ -260,11 +268,11 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
_kernel_event = nullptr;
// Check the execution time
- if(diff < min_exec_time)
+ if (diff < min_exec_time)
{
min_exec_time = diff;
opt_tuning_params.set_lws(tuning_test.get_lws());
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
}
@@ -292,30 +300,30 @@ void CLTuner::load_from_file(const std::string &filename)
std::ifstream fs;
fs.exceptions(std::ifstream::badbit);
fs.open(filename, std::ios::in);
- if(!fs.is_open())
+ if (!fs.is_open())
{
ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
}
std::string line;
bool header_line = true;
- while(!std::getline(fs, line).fail())
+ while (!std::getline(fs, line).fail())
{
- if(header_line)
+ if (header_line)
{
header_line = false;
size_t pos_lws = line.find("lws");
size_t pos_wbsm = line.find("wbsm");
_tuning_info.tune_wbsm = false;
- if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
+ if (pos_lws != std::string::npos || pos_wbsm != std::string::npos)
{
// The file has in the first line the parameters it has been tuned on
- if(pos_wbsm != std::string::npos)
+ if (pos_wbsm != std::string::npos)
{
_tuning_info.tune_wbsm = true;
}
// Once the line with the tuning parameter is read we can
// read the next one to start collecting the values
- if(std::getline(fs, line).fail())
+ if (std::getline(fs, line).fail())
{
break;
}
@@ -324,13 +332,13 @@ void CLTuner::load_from_file(const std::string &filename)
CLTuningParams tuning_params;
size_t pos = line.find(";");
- if(pos == std::string::npos)
+ if (pos == std::string::npos)
{
ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
std::string kernel_id = line.substr(0, pos);
line.erase(0, pos + 1);
- if(!tuning_params.from_string(_tuning_info, line))
+ if (!tuning_params.from_string(_tuning_info, line))
{
ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
@@ -341,7 +349,7 @@ void CLTuner::load_from_file(const std::string &filename)
bool CLTuner::save_to_file(const std::string &filename) const
{
- if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
+ if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
{
return false;
}
@@ -350,16 +358,16 @@ bool CLTuner::save_to_file(const std::string &filename) const
fs.open(filename, std::ios::out);
std::string header_string = "";
header_string += "lws";
- if(_tuning_info.tune_wbsm)
+ if (_tuning_info.tune_wbsm)
{
- if(!header_string.empty())
+ if (!header_string.empty())
{
header_string += " ";
}
header_string += "wbsm";
}
fs << header_string << std::endl;
- for(auto const &kernel_data : _tuning_params_table)
+ for (auto const &kernel_data : _tuning_params_table)
{
CLTuningParams tun_pams(kernel_data.second);
fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
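
Illustration (editor's sketch, not part of the patch): the CLTuner members touched above are exercised through a small public surface. A typical-use sketch, assuming CLTuningInfo{} defaults are acceptable and that the tuning database file already exists:

    #include <string>

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTuner.h"

    void run_with_tuner(const std::string &tuning_db)
    {
        // Tune kernels that are not yet in the table; CLTuningInfo{} keeps the default tuning mode.
        arm_compute::CLTuner tuner(/* tune_new_kernels */ true, arm_compute::CLTuningInfo{});
        tuner.load_from_file(tuning_db); // raises ARM_COMPUTE_ERROR if the file cannot be opened
        arm_compute::CLScheduler::get().default_init(&tuner, nullptr, arm_compute::CLBackendType::Native);

        // ... configure and run CL functions; tuned LWS/WBSM hints are applied per kernel config_id ...

        tuner.save_to_file(tuning_db); // writes only if new kernels were tuned (see save_to_file above)
    }
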
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index 4530537789..bc782c3a2c 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -26,15 +26,14 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
using namespace arm_compute;
ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT
- : _kernel(),
- _border_handler(std::make_unique<CLFillBorderKernel>()),
- _ctx(ctx)
+ : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx)
{
}
diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp
index da3d4850bf..294396c28a 100644
--- a/src/runtime/CL/Utils.cpp
+++ b/src/runtime/CL/Utils.cpp
@@ -35,20 +35,20 @@ namespace arm_compute
void restore_program_cache_from_file(const std::string &filename)
{
std::ifstream cache_file(filename, std::ios::binary);
- if(cache_file.is_open())
+ if (cache_file.is_open())
{
- if(!CLScheduler::get().is_initialised())
+ if (!CLScheduler::get().is_initialised())
{
arm_compute::CLScheduler::get().default_init();
}
- while(!cache_file.eof())
+ while (!cache_file.eof())
{
size_t name_len = 0;
size_t binary_len = 0;
cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t));
cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t));
- if(name_len == 0 || binary_len == 0)
+ if (name_len == 0 || binary_len == 0)
{
break;
}
@@ -60,7 +60,7 @@ void restore_program_cache_from_file(const std::string &filename)
tmp.resize(binary_len);
cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len);
cl::Context context = arm_compute::CLScheduler::get().context();
- cl::Program::Binaries binaries{ binary };
+ cl::Program::Binaries binaries{binary};
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
cl::Program program(context, devices, binaries);
program.build();
@@ -72,12 +72,12 @@ void restore_program_cache_from_file(const std::string &filename)
void save_program_cache_to_file(const std::string &filename)
{
- if(CLScheduler::get().is_initialised())
+ if (CLScheduler::get().is_initialised())
{
std::ofstream cache_file(filename, std::ios::binary);
- if(cache_file.is_open())
+ if (cache_file.is_open())
{
- for(const auto &it : CLKernelLibrary::get().get_built_programs())
+ for (const auto &it : CLKernelLibrary::get().get_built_programs())
{
std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>();
ARM_COMPUTE_ERROR_ON(binaries.size() != 1);
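
Illustration (editor's sketch, not part of the patch): the two helpers reformatted above form a save/restore pair for compiled OpenCL programs. A minimal wrapper, assuming they are declared in arm_compute/runtime/CL/Utils.h:

    #include <string>

    #include "arm_compute/runtime/CL/Utils.h" // assumed declaration site of the two helpers

    void with_program_cache(const std::string &cache_path)
    {
        // Re-use binaries built on a previous run; also default-initialises the scheduler
        // when the cache file is present (see the hunk above).
        arm_compute::restore_program_cache_from_file(cache_path);

        // ... build and run CL functions; newly compiled programs land in CLKernelLibrary ...

        // Persist every built program so the next process start can skip online compilation.
        arm_compute::save_program_cache_to_file(cache_path);
    }
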
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index f324b1a68c..c035644e4a 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClActivation.h"
@@ -35,18 +36,17 @@ namespace arm_compute
{
struct CLActivationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- CLRuntimeContext *ctx{ nullptr };
- std::unique_ptr<opencl::ClActivation> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ CLRuntimeContext *ctx{nullptr};
+ std::unique_ptr<opencl::ClActivation> op{nullptr};
};
-CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default;
CLActivationLayer::~CLActivationLayer() = default;
@@ -55,7 +55,10 @@ void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, Activatio
configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
}
-void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -66,7 +69,8 @@ void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTe
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info);
}
-Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
return opencl::ClActivation::validate(input, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index b30d739025..f9bbd31e8a 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -27,31 +27,39 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/runtime/Utils.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis()
+ : _memory_group(std::move(memory_manager)),
+ _not_reshaped_output(),
+ _arg_min_max_kernel(),
+ _reshape(),
+ _reduction_axis()
{
}
CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
-Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid reduction operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
DataType output_data_type = DataType::S32;
@@ -59,17 +67,18 @@ Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITen
const auto input_num_channles = input->num_channels();
const auto input_qinfo = input->quantization_info();
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
output_data_type = output->data_type();
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
}
auto shape_before_reshape = input->tensor_shape();
shape_before_reshape.set(axis, 1);
- auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
- {
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+ QuantizationInfo qinfo) {
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
};
@@ -85,20 +94,36 @@ void CLArgMinMaxLayer::configure(const ICLTensor *input, int axis, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
}
-void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int axis,
+ ICLTensor *output,
+ const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
_reduction_axis = axis;
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
-
- TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() };
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ DataType output_data_type =
+ (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
+
+ TensorShape not_reshaped_output_shape{input->info()->tensor_shape()};
not_reshaped_output_shape.set(axis, 1);
- auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+ ->clone()
+ ->set_tensor_shape(not_reshaped_output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
_arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>();
_arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op);
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index e8affc0853..0c371c4171 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -30,9 +30,8 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
namespace arm_compute
{
@@ -43,24 +42,40 @@ CLBatchNormalizationLayer::CLBatchNormalizationLayer()
CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
-void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
+void CLBatchNormalizationLayer::configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
ActivationLayerInfo act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
}
-void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
- const ICLTensor *gamma, float epsilon,
- ActivationLayerInfo act_info)
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta,
+ const ICLTensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
_norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
}
-Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info);
}
@@ -69,4 +84,4 @@ void CLBatchNormalizationLayer::run()
{
CLScheduler::get().enqueue(*_norm_kernel, true);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index d7a409128d..a3798daf61 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -30,14 +30,12 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
namespace arm_compute
{
-CLBatchToSpaceLayer::CLBatchToSpaceLayer()
- : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
+CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
{
}
@@ -49,29 +47,43 @@ void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *blo
_batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
}
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
_batch_to_space_kernel->configure(compile_context, input, block_shape, output);
}
-void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(
+ const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
}
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output);
_batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info);
}
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
}
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
}
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index a4712ed3f1..7bfd0e3677 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,11 +35,14 @@ void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, I
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::AND);
_kernel = std::move(k);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 5964b92447..9763915c02 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -43,4 +42,4 @@ void CLBitwiseNot::configure(const CLCompileContext &compile_context, const ICLT
k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT);
_kernel = std::move(k);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index a07bf17bb2..dd3171b982 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,11 +35,14 @@ void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, IC
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseOr::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
k->configure(compile_context, input1, input2, output, BitwiseOperation::OR);
_kernel = std::move(k);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index f65e2e406c..5bee4b37ec 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
#include <utility>
@@ -36,7 +35,10 @@ void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, I
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseXor::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<CLBitwiseKernel>();
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 48583bfaf3..76e626fd75 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -23,18 +23,24 @@
*/
#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
-#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
namespace arm_compute
{
-void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
}
-void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
@@ -44,7 +50,10 @@ void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context,
_kernel = std::move(k);
}
-Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
}
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 10f7cc2065..42ec8f7ee0 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
@@ -37,16 +37,15 @@ namespace arm_compute
{
struct CLCast::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCast> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCast> op{nullptr};
};
-CLCast::CLCast()
- : _impl(std::make_unique<Impl>())
+CLCast::CLCast() : _impl(std::make_unique<Impl>())
{
}
-CLCast::CLCast(CLCast &&) = default;
+CLCast::CLCast(CLCast &&) = default;
CLCast &CLCast::operator=(CLCast &&) = default;
CLCast::~CLCast() = default;
@@ -55,7 +54,10 @@ void CLCast::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy
configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
}
-void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+void CLCast::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, policy);
@@ -74,7 +76,7 @@ Status CLCast::validate(const ITensorInfo *input, const ITensorInfo *output, Con
void CLCast::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
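
For reference, a minimal sketch of driving the CLCast API reformatted above (a U8 to F32 conversion); the shape and the SATURATE policy are illustrative assumptions.

// Minimal sketch (not part of the patch): converting a U8 tensor to F32 with
// CLCast. Shape and ConvertPolicy are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    CLCast cast;
    cast.configure(&src, &dst, ConvertPolicy::SATURATE);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    cast.run();                // builds the {ACL_SRC, ACL_DST} ITensorPack shown above and runs the operator
    CLScheduler::get().sync();
    return 0;
}
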
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index 021f28f238..1ee4789816 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
namespace arm_compute
{
@@ -35,7 +35,10 @@ void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output,
configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
}
-void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups)
{
ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
auto k = std::make_unique<CLChannelShuffleLayerKernel>();
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 192a266f0f..2f54371e88 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -25,10 +25,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLComparisonKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
@@ -37,25 +37,33 @@ void CLComparison::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
}
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output, operation);
auto k = std::make_unique<CLComparisonKernel>();
k->configure(compile_context, input1, input2, output, operation);
_kernel = std::move(k);
- if(output->info()->dimension(0) > 1)
+ if (output->info()->dimension(0) > 1)
{
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
- if(broadcasted_info->info()->dimension(0) == 1)
+ if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+ BorderMode::REPLICATE);
}
}
}
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation)
{
return CLComparisonKernel::validate(input1, input2, output, operation);
}
@@ -67,25 +75,30 @@ void CLComparisonStatic<COP>::configure(ICLTensor *input1, ICLTensor *input2, IC
}
template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
auto k = std::make_unique<CLComparisonKernel>();
k->configure(compile_context, input1, input2, output, COP);
_kernel = std::move(k);
- if(output->info()->dimension(0) > 1)
+ if (output->info()->dimension(0) > 1)
{
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
- if(broadcasted_info->info()->dimension(0) == 1)
+ if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+ BorderMode::REPLICATE);
}
}
}
template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
return CLComparisonKernel::validate(input1, input2, output, COP);
}
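
For reference, a minimal sketch of driving the CLComparison API reformatted above, producing a U8 mask; the shapes and the Greater operation are illustrative assumptions.

// Minimal sketch (not part of the patch): element-wise comparison of two F32
// tensors into a U8 mask. Shapes and the Greater operation are illustrative
// assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLComparison.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor lhs, rhs, mask;
    lhs.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
    rhs.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
    mask.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::U8));

    CLComparison cmp;
    // If one input had dimension(0) == 1 it would be broadcast; the border
    // handler configured above replicates its edge values in that case.
    cmp.configure(&lhs, &rhs, &mask, ComparisonOperation::Greater);

    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    mask.allocator()->allocate();

    cmp.run();
    CLScheduler::get().sync();
    return 0;
}
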
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 0a8884f4e3..9df1c34593 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -24,24 +24,23 @@
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConcatenate.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
namespace arm_compute
{
struct CLConcatenateLayer::Impl
{
std::vector<const ICLTensor *> srcs{};
- ICLTensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<opencl::ClConcatenate> op{ nullptr };
+ ICLTensor *dst{nullptr};
+ unsigned int num_inputs{0};
+ unsigned int axis{0};
+ std::unique_ptr<opencl::ClConcatenate> op{nullptr};
};
-CLConcatenateLayer::CLConcatenateLayer()
- : _impl(std::make_unique<Impl>())
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>())
{
}
@@ -56,7 +55,10 @@ void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector
configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
}
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context,
+ std::vector<const ICLTensor *> &inputs_vector,
+ ICLTensor *output,
+ size_t axis)
{
ARM_COMPUTE_ERROR_ON(output == nullptr);
ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis);
@@ -68,7 +70,7 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->op = std::make_unique<opencl::ClConcatenate>();
std::vector<ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+ for (unsigned int i = 0; i < inputs_vector.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -76,7 +78,9 @@ void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std:
_impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis);
}
-Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+ const ITensorInfo *output,
+ size_t axis)
{
return opencl::ClConcatenate::validate(inputs_vector, output, axis);
}
@@ -84,7 +88,7 @@ Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
void CLConcatenateLayer::run()
{
ITensorPack pack;
- for(unsigned i = 0; i < _impl->num_inputs; ++i)
+ for (unsigned i = 0; i < _impl->num_inputs; ++i)
{
pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
}
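
For reference, a minimal sketch of driving the CLConcatenateLayer API reformatted above, joining two tensors along axis 0; the shapes and the axis are illustrative assumptions.

// Minimal sketch (not part of the patch): concatenating two tensors along
// axis 0 (width). Shapes and the axis are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // 8 + 8 along axis 0

    std::vector<const ICLTensor *> inputs = {&a, &b};

    CLConcatenateLayer concat;
    concat.configure(inputs, &out, 0); // srcs/num_inputs/axis are stored in the Impl struct shown above

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    concat.run(); // packs ACL_SRC_VEC + i for each input, as in run() above
    CLScheduler::get().sync();
    return 0;
}
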
diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp
index 729b973b6a..9d1b368f72 100644
--- a/src/runtime/CL/functions/CLConv3D.cpp
+++ b/src/runtime/CL/functions/CLConv3D.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLConv3D.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/gpu/cl/operators/ClDirectConv3d.h"
namespace arm_compute
@@ -32,29 +33,38 @@ using namespace arm_compute::experimental;
struct CLConv3D::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDirectConv3d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDirectConv3d> op{nullptr};
};
-CLConv3D::CLConv3D()
- : _impl(std::make_unique<Impl>())
+CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>())
{
}
CLConv3D::~CLConv3D() = default;
-void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info)
{
configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info);
}
-void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(
+ src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
_impl->src = src;
_impl->weights = weights;
@@ -62,10 +72,15 @@ void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTenso
_impl->dst = dst;
_impl->op = std::make_unique<opencl::ClDirectConv3d>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(),
+ _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
}
-Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status CLConv3D::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
{
return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info);
}
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index b3efe5c8a0..2298f2a669 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -27,33 +27,37 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
namespace arm_compute
{
struct CLConvertFullyConnectedWeights::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr};
};
-CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights()
- : _impl(std::make_unique<Impl>())
+CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
{
}
CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default;
-void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
}
-void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout);
@@ -63,8 +67,10 @@ void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_c
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
}
-Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
}
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f3c05adb47..7767b45a01 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -28,11 +28,11 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClConv2d.h"
-
-#include "src/common/utils/Log.h"
#include "support/Cast.h"
namespace arm_compute
@@ -43,41 +43,59 @@ struct CLConvolutionLayer::Impl
{
MemoryGroup memory_group{};
std::shared_ptr<IMemoryManager> memory_manager{};
- std::unique_ptr<opencl::IClOperator> op{ nullptr };
+ std::unique_ptr<opencl::IClOperator> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
WorkspaceData<CLTensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- std::unique_ptr<IFunction> func{ nullptr };
+ std::unique_ptr<IFunction> func{nullptr};
};
-CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_manager = std::move(memory_manager);
}
CLConvolutionLayer::~CLConvolutionLayer() = default;
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, enable_fast_math, num_groups);
}
-void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
- weights_info, CLScheduler::get().target()))
+ switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
+ weights_info, CLScheduler::get().target()))
{
case ConvolutionMethod::WINOGRAD:
case ConvolutionMethod::DIRECT:
@@ -85,7 +103,8 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
case ConvolutionMethod::GEMM:
{
auto f = std::make_unique<opencl::ClConv2d>();
- f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+ f->configure(compile_context, input->info(), weights->info(),
+ ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
_impl->op = std::move(f);
break;
}
@@ -101,40 +120,52 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLT
break;
}
- if(_impl->op)
+ if (_impl->op)
{
_impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
const GPUTarget gpu_target = CLScheduler::get().target();
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
+ switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
{
case ConvolutionMethod::WINOGRAD:
case ConvolutionMethod::DIRECT:
case ConvolutionMethod::INDIRECT:
case ConvolutionMethod::GEMM:
{
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
break;
}
case ConvolutionMethod::FFT:
{
// Validate FFT-based convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+ act_info, enable_fast_math));
break;
}
default:
@@ -145,8 +176,15 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
return Status{};
}
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info,
+ const GPUTarget gpu_target,
+ const Size2D &dilation,
+ bool enable_fast_math)
{
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1);
return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target);
@@ -158,7 +196,7 @@ void CLConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_impl->memory_group);
- if(_impl->func)
+ if (_impl->func)
{
_impl->func->run();
}
@@ -170,7 +208,7 @@ void CLConvolutionLayer::run()
void CLConvolutionLayer::prepare()
{
- if(_impl->func)
+ if (_impl->func)
{
_impl->func->prepare();
}
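
For reference, a minimal sketch of querying the same method-selection logic the reflowed switch above dispatches on, via the static get_convolution_method() overload shown in this hunk; the shapes, padding and FP32 data type are illustrative assumptions.

// Minimal sketch (not part of the patch): asking CLConvolutionLayer which
// convolution path (GEMM/DIRECT/WINOGRAD/FFT/...) it would pick for a given
// problem. Shapes, padding and data type are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // so a concrete GPUTarget can be queried

    // Assumed NCHW shapes: 32x32x3 input, 16 filters of 3x3x3, same-padded output.
    const TensorInfo     src(TensorShape(32U, 32U, 3U, 1U), 1, DataType::F32);
    const TensorInfo     weights(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32);
    const TensorInfo     dst(TensorShape(32U, 32U, 16U, 1U), 1, DataType::F32);
    const PadStrideInfo  conv_info(1, 1, 1, 1); // stride 1, pad 1

    const ConvolutionMethod method = CLConvolutionLayer::get_convolution_method(
        &src, &weights, &dst, conv_info, WeightsInfo(), ActivationLayerInfo(), CLScheduler::get().target(),
        Size2D(1U, 1U), /* enable_fast_math */ false);

    std::cout << "Selected convolution method: " << static_cast<int>(method) << std::endl;
    return 0;
}
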
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 56400b67a0..a4f2b0634f 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -27,10 +27,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCopy.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCopy.h"
#include <utility>
@@ -38,16 +38,15 @@ namespace arm_compute
{
struct CLCopy::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCopy> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCopy> op{nullptr};
};
-CLCopy::CLCopy()
- : _impl(std::make_unique<Impl>())
+CLCopy::CLCopy() : _impl(std::make_unique<Impl>())
{
}
-CLCopy::CLCopy(CLCopy &&) = default;
+CLCopy::CLCopy(CLCopy &&) = default;
CLCopy &CLCopy::operator=(CLCopy &&) = default;
CLCopy::~CLCopy() = default;
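
For reference, a minimal sketch of driving the CLCopy API touched above; the 1-D shape is an illustrative assumption, and the call relies on the default destination window.

// Minimal sketch (not part of the patch): copying one tensor into another
// with CLCopy. The shape is an illustrative assumption.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

    CLCopy copy;
    copy.configure(&src, &dst); // forwards to the opencl::ClCopy operator held in Impl

    src.allocator()->allocate();
    dst.allocator()->allocate();

    copy.run();
    CLScheduler::get().sync();
    return 0;
}
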
diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp
index 35ea17cfc2..fc29c43827 100644
--- a/src/runtime/CL/functions/CLCrop.cpp
+++ b/src/runtime/CL/functions/CLCrop.cpp
@@ -27,10 +27,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCrop.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCrop.h"
#include <utility>
@@ -38,27 +38,38 @@ namespace arm_compute
{
struct CLCrop::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCrop> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCrop> op{nullptr};
};
-CLCrop::CLCrop()
- : _impl(std::make_unique<Impl>())
+CLCrop::CLCrop() : _impl(std::make_unique<Impl>())
{
}
-CLCrop::CLCrop(CLCrop &&) = default;
+CLCrop::CLCrop(CLCrop &&) = default;
CLCrop &CLCrop::operator=(CLCrop &&) = default;
CLCrop::~CLCrop() = default;
-void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void CLCrop::configure(const ICLTensor *src,
+ ICLTensor *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
- configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
+ configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value,
+ dst_window);
}
-void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
- Window *dst_window)
+void CLCrop::configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ ICLTensor *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
@@ -67,10 +78,17 @@ void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor
_impl->dst = dst;
_impl->op = std::make_unique<opencl::ClCrop>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index,
+ extrapolation_value, dst_window);
}
-Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status CLCrop::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
{
return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window);
}
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index d8fc38d99e..821412b149 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,19 +25,26 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/common/utils/Log.h"
-
#include <cstddef>
namespace arm_compute
{
namespace
{
-inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+inline void configure_crop(const ICLTensor *input,
+ ICLTensor *crop_boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ uint32_t crop_box_ind,
+ Coordinates &start,
+ Coordinates &end,
+ uint32_t &batch_index)
{
batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
@@ -50,30 +57,48 @@ inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTen
// The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
+ end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1,
+ static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
output->info()->set_tensor_shape(out_shape);
}
} // namespace
CLCropResize::CLCropResize()
- : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions()
+ : _input(nullptr),
+ _boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _num_boxes(0),
+ _method(),
+ _extrapolation_value(0),
+ _scale(),
+ _copy(),
+ _crop_results(),
+ _scaled_results(),
+ _internal_functions()
{
}
CLCropResize::~CLCropResize() = default;
-Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status CLCropResize::validate(const ITensorInfo *input,
+ ITensorInfo *boxes,
+ ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
TensorInfo temp_info;
- ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
- if(output->total_size() > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1},
+ input->dimension(3) - 1, extrapolation_value));
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,20 +108,34 @@ Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITen
return Status{};
}
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+ configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+ extrapolation_value);
}
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
- ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+ crop_size, method, extrapolation_value));
ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
- TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+ TensorShape output_shape =
+ TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
_num_boxes = boxes->info()->tensor_shape()[1];
@@ -122,7 +161,7 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
// kernels used for cropping and scaling.
_boxes->map(CLScheduler::get().queue());
_box_ind->map(CLScheduler::get().queue());
- for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
+ for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
{
auto crop_tensor = std::make_unique<CLTensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -143,7 +182,9 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
auto scale_kernel = std::make_unique<CLScale>();
- scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT });
+ scale_kernel->configure(
+ compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(),
+ ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT});
_scale.emplace_back(std::move(scale_kernel));
Window win = calculate_max_window(*_output->info());
@@ -159,28 +200,50 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
bool is_width_flipped = end[0] < start[0];
bool is_height_flipped = end[1] < start[1];
/** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */
- std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+ std::array<int32_t, 2> rows_out_of_bounds{0};
/** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */
- std::array<int32_t, 2> cols_out_of_bounds{ 0 };
- if(is_height_flipped)
+ std::array<int32_t, 2> cols_out_of_bounds{0};
+ if (is_height_flipped)
{
- rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
- rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+ rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(start[1] - _input->info()->dimension(2) + 1,
+ _crop_results[num_box].get()->info()->dimension(2))
+ : 0;
+ rows_out_of_bounds[1] =
+ end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+ : 0;
}
else
{
- rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
- rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+ rows_out_of_bounds[0] =
+ start[1] < 0
+ ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+ : 0;
+ rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(end[1] - _input->info()->dimension(2) + 1,
+ _crop_results[num_box].get()->info()->dimension(2))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
- cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+ cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(start[0] - _input->info()->dimension(1) + 1,
+ _crop_results[num_box].get()->info()->dimension(1))
+ : 0;
+ cols_out_of_bounds[1] =
+ end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+ : 0;
}
else
{
- cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
- cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+ cols_out_of_bounds[0] =
+ start[0] < 0
+ ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+ : 0;
+ cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(end[0] - _input->info()->dimension(1) + 1,
+ _crop_results[num_box].get()->info()->dimension(1))
+ : 0;
}
Window full_window = calculate_max_window(*_crop_results[num_box].get()->info());
@@ -203,67 +266,84 @@ void CLCropResize::configure(const CLCompileContext &compile_context, const ICLT
// Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds
// with the extrapolation value using memset.
// First for the rows before the in bounds rows.
- if(rows_out_of_bounds[0] > 0)
+ if (rows_out_of_bounds[0] > 0)
{
Window slice_fill_rows_before(full_window);
slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_rows_before);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
Window slice_in(full_window);
- slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
- slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
-
- int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
- if(rows_in_bounds > 0)
+ slice_in.set(2,
+ Window::Dimension(rows_out_of_bounds[0],
+ _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
+ slice_in.set(1,
+ Window::Dimension(cols_out_of_bounds[0],
+ _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
+
+ int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) -
+ rows_out_of_bounds[0] - rows_out_of_bounds[1];
+ if (rows_in_bounds > 0)
{
// Fill all elements that share a row with an in bounds element with the extrapolation value.
- if(cols_out_of_bounds[0] > 0)
+ if (cols_out_of_bounds[0] > 0)
{
Window slice_fill_cols_before(slice_in);
slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_cols_before);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
- if(cols_out_of_bounds[1] > 0)
+ if (cols_out_of_bounds[1] > 0)
{
Window slice_fill_cols_after(slice_in);
- slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1));
+ slice_fill_cols_after.set(
+ 1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1],
+ _crop_results[num_box].get()->info()->dimension(1), 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_cols_after);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
// Copy all elements within the input bounds from the input tensor.
- int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
- if(cols_in_bounds > 0)
+ int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) -
+ cols_out_of_bounds[0] - cols_out_of_bounds[1];
+ if (cols_in_bounds > 0)
{
- Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
- is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
- Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
- is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+ Coordinates2D start_in{
+ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+ is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]};
+ Coordinates2D end_in{
+ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+ is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1};
auto kernel = std::make_unique<CLCrop>();
- kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in);
+ kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index,
+ extrapolation_value, &slice_in);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
}
// Fill all rows after the in bounds elements with the extrapolation value.
- if(rows_out_of_bounds[1] > 0)
+ if (rows_out_of_bounds[1] > 0)
{
Window slice_fill_rows_after(full_window);
- slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1));
+ slice_fill_rows_after.set(
+ 2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1],
+ _crop_results[num_box].get()->info()->dimension(2), 1));
auto kernel = std::make_unique<CLFill>();
- kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after);
+ kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+ &slice_fill_rows_after);
//_internal_functions.emplace_back(std::move(kernel));
_internal_functions.push_back(std::move(kernel));
}
@@ -277,18 +357,18 @@ void CLCropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- for(unsigned int i = 0; i < _internal_functions.size(); ++i)
+ for (unsigned int i = 0; i < _internal_functions.size(); ++i)
{
_internal_functions[i]->run();
}
CLScheduler::get().sync();
- for(auto &kernel : _scale)
+ for (auto &kernel : _scale)
{
kernel->run();
}
CLScheduler::get().sync();
- for(auto &kernel : _copy)
+ for (auto &kernel : _copy)
{
kernel->run();
}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 4421a18f2a..e988ab0ac4 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -25,16 +25,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/IClOperator.h"
#include "src/gpu/cl/operators/ClTransposedConvolution.h"
-#include "src/common/utils/Log.h"
-
#include <cmath>
#include <memory>
#include <tuple>
@@ -44,11 +44,11 @@ using namespace arm_compute::misc::shape_calculator;
struct CLDeconvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::IClOperator> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::IClOperator> op{nullptr};
};
CLDeconvolutionLayer::~CLDeconvolutionLayer() = default;
@@ -58,24 +58,35 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
{
}
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
}
-void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info);
- switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+ switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
+ deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
auto op = std::make_unique<opencl::ClTransposedConvolution>();
- op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info);
+ op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr,
+ output->info(), deconv_info);
_impl->src = input;
_impl->weights = weights;
@@ -105,22 +116,28 @@ void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, IC
}
}
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+ switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
{
case DeconvolutionMethod::DIRECT:
{
// Validate transposed convolution operator
- ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
break;
}
case DeconvolutionMethod::UPSCALE_CONV2D:
{
// Validate direct convolution layer
- ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
break;
}
case DeconvolutionMethod::GEMM:
@@ -137,12 +154,16 @@ Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info)
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(output, bias, weights_info);
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
return DeconvolutionMethod::UPSCALE_CONV2D;
}
@@ -154,11 +175,12 @@ DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensor
const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
const size_t ofm = weights->tensor_shape()[idx_n];
- if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
+ if (weights->dimension(idx_w) != deconv_info.stride().first ||
+ weights->dimension(idx_h) != deconv_info.stride().second)
{
// We observe better performance for FP32 types only when ofm <= 16.
// A better heuristic is required for selecting the method for FP16 data types.
- if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
+ if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
{
return DeconvolutionMethod::DIRECT;
}
@@ -175,7 +197,7 @@ void CLDeconvolutionLayer::run()
{
prepare();
- if(_impl->op != nullptr)
+ if (_impl->op != nullptr)
{
// Optimized Operator will be used
ITensorPack pack;
@@ -195,7 +217,7 @@ void CLDeconvolutionLayer::run()
void CLDeconvolutionLayer::prepare()
{
- if(_impl->op == nullptr)
+ if (_impl->op == nullptr)
{
_function->prepare();
}
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 0b428f5b17..b92bf903a6 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -27,22 +27,21 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
namespace arm_compute
{
CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
- : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()),
- _fill(),
- _output(nullptr)
+ : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr)
{
}
CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default;
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
+Status
+CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
{
return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
}
@@ -52,13 +51,17 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PadStrideInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, info);
_output = output;
- _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
+ _fill.configure(compile_context, _output,
+ PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
_upsample->configure(compile_context, input, _output, info);
}
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index cac3f51013..6d2fea974e 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
#include <utility>
@@ -37,16 +37,15 @@ namespace arm_compute
{
struct CLDepthConvertLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClCast> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClCast> op{nullptr};
};
-CLDepthConvertLayer::CLDepthConvertLayer()
- : _impl(std::make_unique<Impl>())
+CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>())
{
}
-CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default;
CLDepthConvertLayer::~CLDepthConvertLayer() = default;
@@ -55,7 +54,11 @@ void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, C
configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
}
-void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ uint32_t shift)
{
ARM_COMPUTE_UNUSED(shift);
ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift);
@@ -70,7 +73,8 @@ void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, con
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
}
-Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
return opencl::ClCast::validate(input, output, policy);
@@ -78,7 +82,7 @@ Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo
void CLDepthConvertLayer::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index 98531e7cac..9477c7f81d 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
-#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
#include <utility>
@@ -36,7 +35,10 @@ void CLDepthToSpaceLayer::configure(const ICLTensor *input, ICLTensor *output, i
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
auto k = std::make_unique<CLDepthToSpaceLayerKernel>();
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index dcb982fa56..873601bb11 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -29,12 +29,12 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
using namespace arm_compute::misc;
@@ -63,25 +63,33 @@ CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemory
CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+ act_info, dilation);
}
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
- ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output != nullptr ? output->info() : input->info(),
- conv_info,
- depth_multiplier,
- act_info,
- dilation));
+ ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+ output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
_is_quantized = is_data_type_quantized(input->info()->data_type());
@@ -96,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
_memory_group.manage(&_permuted_output);
@@ -119,10 +127,12 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
CLTensor *output_multipliers_to_use = nullptr;
CLTensor *output_shifts_to_use = nullptr;
- if(_is_quantized)
+ if (_is_quantized)
{
- const size_t idx_c = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+ const size_t idx_c =
+ get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t num_filters =
+ (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
_output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
_output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -132,16 +142,18 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
}
// Get the depthwise convolution compute parameters
- auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
- const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
_dwc_native_kernel->set_target(gpu_target);
_dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
- dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use);
+ dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+ output_shifts_to_use);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
@@ -151,22 +163,27 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
_permuted_output.allocator()->allocate();
}
- if(_is_quantized)
+ if (_is_quantized)
{
_output_multipliers.allocator()->allocate();
_output_shifts.allocator()->allocate();
}
}
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+ unsigned int depth_multiplier,
+ ActivationLayerInfo act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
const bool in_place = input == output || output == nullptr;
- if(in_place)
+ if (in_place)
{
output = input;
}
@@ -174,21 +191,23 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+ input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+ input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
const GPUTarget gpu_target = CLScheduler::get().target();
- const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
const bool needs_permute = input->data_layout() == DataLayout::NCHW;
const bool is_quantized = is_data_type_quantized(input->data_type());
TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
- if(is_quantized)
+ if (is_quantized)
{
- if(is_data_type_quantized_per_channel(weights->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
@@ -201,40 +220,57 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
}
}
- if(needs_permute)
+ if (needs_permute)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
TensorShape permuted_input_shape = input->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
- TensorShape permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+ const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+ TensorShape permuted_output_shape =
+ shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
- const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
- const TensorInfo permuted_output = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_input = input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_weights = weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC);
+ const TensorInfo permuted_output = output->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NHWC);
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
// Get the depthwise convolution compute parameters
- auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
- dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+ &output_multipliers_shifts_info, &output_multipliers_shifts_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
}
else
{
// Get the depthwise convolution compute parameters
- auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
- const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
- &output_multipliers_shifts_info));
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info =
+ t->configure(input, weights, conv_info, dilation, depth_multiplier);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+ input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+ &output_multipliers_shifts_info));
}
return Status{};
}
@@ -245,12 +281,12 @@ void CLDepthwiseConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_to_nhwc.run();
}
CLScheduler::get().enqueue(*_dwc_native_kernel);
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_to_nchw.run();
}
@@ -258,22 +294,21 @@ void CLDepthwiseConvolutionLayer::run()
void CLDepthwiseConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_is_quantized)
+ if (_is_quantized)
{
_output_multipliers.map();
_output_shifts.map();
- quantization::compute_quantized_multipliers_and_shifts(_input->info(),
- _original_weights->info(),
- _output != nullptr ? _output->info() : _input->info(),
- reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
- reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+ quantization::compute_quantized_multipliers_and_shifts(
+ _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+ reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+ reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
_output_multipliers.unmap();
_output_shifts.unmap();
}
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 64c6b5d91c..20162a03db 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -26,22 +26,21 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClDequantize.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
namespace arm_compute
{
struct CLDequantizationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDequantize> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDequantize> op{nullptr};
};
-CLDequantizationLayer::CLDequantizationLayer()
- : _impl(std::make_unique<Impl>())
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
{
}
CLDequantizationLayer::~CLDequantizationLayer() = default;
@@ -51,7 +50,9 @@ void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
configure(CLKernelLibrary::get().get_compile_context(), input, output);
}
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->src = input;
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 752e0e4a60..d6dae0d732 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -28,37 +28,46 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/operators/ClActivation.h"
-#include "src/gpu/cl/operators/ClDirectConv2d.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
namespace arm_compute
{
struct CLDirectConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
};
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
- : _impl(std::make_unique<Impl>())
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
{
}
-CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
CLDirectConvolutionLayer::~CLDirectConvolutionLayer() = default;
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
}
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -69,10 +78,15 @@ void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClDirectConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
}
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -87,4 +101,4 @@ void CLDirectConvolutionLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 88c3c6193c..3717f30ae1 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -26,15 +26,15 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
-
#include <memory>
#include <tuple>
@@ -55,11 +55,16 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa
{
}
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -70,20 +75,22 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+ weights->dimension(idx_w), weights->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- if(input->data_type() != weights->data_type())
+ if (input->data_type() != weights->data_type())
{
- ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+ !is_data_type_quantized_asymmetric(input->data_type()));
}
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -102,24 +109,39 @@ Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITen
unsigned int deconv_pad_y = 0;
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
return Status{};
}
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
}
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
@@ -141,15 +163,19 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+ auto out_dims =
+ deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
_is_prepared = weights_info.retain_internal_weights();
@@ -158,7 +184,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
// Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
unsigned int deconv_pad_x = 0;
unsigned int deconv_pad_y = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+ *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
unsigned int deconv_pad_left = pad_right > pad_left ? pad_right - pad_left : 0;
unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -179,7 +206,8 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_scaled_output.allocator()->init(scale_out_info);
// configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+ const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+ deconv_pad_bottom, DimensionRoundingType::FLOOR);
_scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
// Setup the function to convolve the upscaled output
@@ -191,7 +219,7 @@ void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_conte
_flip_axis.allocator()->allocate();
_flip_axis.map(true);
auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
- if(weights->info()->data_layout() == DataLayout::NHWC)
+ if (weights->info()->data_layout() == DataLayout::NHWC)
{
axis_data[0] = 1;
axis_data[1] = 2;
@@ -216,7 +244,7 @@ void CLDirectDeconvolutionLayer::run()
void CLDirectDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
@@ -229,7 +257,7 @@ void CLDirectDeconvolutionLayer::prepare()
_conv_f.prepare();
// Free flipped weights
- if(!_weights_flipped.is_used())
+ if (!_weights_flipped.is_used())
{
_weights_flipped.allocator()->free();
}
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 936b37fb31..d9529f0b7f 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -26,8 +26,8 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClAdd.h"
#include "src/gpu/cl/operators/ClElementwiseOperations.h"
#include "src/gpu/cl/operators/ClSub.h"
@@ -36,26 +36,30 @@ namespace arm_compute
{
struct CLArithmeticAddition::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClAdd> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClAdd> op{nullptr};
};
-CLArithmeticAddition::CLArithmeticAddition()
- : _impl(std::make_unique<Impl>())
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
CLArithmeticAddition::~CLArithmeticAddition() = default;
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+ ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
}
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
@@ -65,7 +69,11 @@ void CLArithmeticAddition::configure(const CLCompileContext &compile_context, co
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
}
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticAddition::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
}
@@ -82,26 +90,33 @@ void CLArithmeticAddition::run()
struct CLArithmeticSubtraction::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClSub> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClSub> op{nullptr};
};
-CLArithmeticSubtraction::CLArithmeticSubtraction()
- : _impl(std::make_unique<Impl>())
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
CLArithmeticSubtraction::~CLArithmeticSubtraction() = default;
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
}
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
@@ -111,7 +126,11 @@ void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context,
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
}
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClSub::validate(input1, input2, output, policy, act_info);
}
@@ -128,26 +147,32 @@ void CLArithmeticSubtraction::run()
struct CLArithmeticDivision::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
};
-CLArithmeticDivision::CLArithmeticDivision()
- : _impl(std::make_unique<Impl>())
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
{
}
-CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
CLArithmeticDivision::~CLArithmeticDivision() = default;
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -156,7 +181,10 @@ void CLArithmeticDivision::configure(const CLCompileContext &compile_context, co
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
}
@@ -173,26 +201,32 @@ void CLArithmeticDivision::run()
struct CLElementwiseMax::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
};
-CLElementwiseMax::CLElementwiseMax()
- : _impl(std::make_unique<Impl>())
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
CLElementwiseMax::~CLElementwiseMax() = default;
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -201,7 +235,10 @@ void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTen
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
}
@@ -218,26 +255,32 @@ void CLElementwiseMax::run()
struct CLElementwiseMin::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseMin> op{nullptr};
};
-CLElementwiseMin::CLElementwiseMin()
- : _impl(std::make_unique<Impl>())
+CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
+CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default;
CLElementwiseMin::~CLElementwiseMin() = default;
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -246,7 +289,10 @@ void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTen
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMin::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseMin::validate(input1, input2, output, act_info);
}
@@ -263,26 +309,32 @@ void CLElementwiseMin::run()
struct CLElementwiseSquaredDiff::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr};
};
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
- : _impl(std::make_unique<Impl>())
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
{
}
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default;
CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff() = default;
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -291,7 +343,10 @@ void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info);
}
@@ -308,26 +363,32 @@ void CLElementwiseSquaredDiff::run()
struct CLElementwisePower::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClElementwisePower> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClElementwisePower> op{nullptr};
};
-CLElementwisePower::CLElementwisePower()
- : _impl(std::make_unique<Impl>())
+CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>())
{
}
-CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
+CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default;
CLElementwisePower::~CLElementwisePower() = default;
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -336,7 +397,10 @@ void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLT
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwisePower::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClElementwisePower::validate(input1, input2, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index 9dcd2d1891..3043c26feb 100644
--- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClElementwiseUnary.h"
@@ -32,17 +33,16 @@ namespace arm_compute
{
struct CLRsqrtLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClRsqrt> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClRsqrt> op{nullptr};
};
-CLRsqrtLayer::CLRsqrtLayer()
- : _impl(std::make_unique<Impl>())
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
{
}
-CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
CLRsqrtLayer::~CLRsqrtLayer() = default;
@@ -74,17 +74,16 @@ void CLRsqrtLayer::run()
struct CLExpLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClExp> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClExp> op{nullptr};
};
-CLExpLayer::CLExpLayer()
- : _impl(std::make_unique<Impl>())
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
{
}
-CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
CLExpLayer::~CLExpLayer() = default;
@@ -116,17 +115,16 @@ void CLExpLayer::run()
struct CLNegLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClNeg> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClNeg> op{nullptr};
};
-CLNegLayer::CLNegLayer()
- : _impl(std::make_unique<Impl>())
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
{
}
-CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
CLNegLayer::~CLNegLayer() = default;
@@ -157,17 +155,16 @@ void CLNegLayer::run()
struct CLSinLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClSin> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClSin> op{nullptr};
};
-CLSinLayer::CLSinLayer()
- : _impl(std::make_unique<Impl>())
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
{
}
-CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
CLSinLayer::~CLSinLayer() = default;
@@ -198,17 +195,16 @@ void CLSinLayer::run()
struct CLAbsLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClAbs> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClAbs> op{nullptr};
};
-CLAbsLayer::CLAbsLayer()
- : _impl(std::make_unique<Impl>())
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
{
}
-CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
CLAbsLayer::~CLAbsLayer() = default;
@@ -239,17 +235,16 @@ void CLAbsLayer::run()
struct CLLogLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClLog> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClLog> op{nullptr};
};
-CLLogLayer::CLLogLayer()
- : _impl(std::make_unique<Impl>())
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
{
}
-CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
CLLogLayer::~CLLogLayer() = default;
@@ -280,17 +275,16 @@ void CLLogLayer::run()
struct CLRoundLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClRound> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClRound> op{nullptr};
};
-CLRoundLayer::CLRoundLayer()
- : _impl(std::make_unique<Impl>())
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
{
}
-CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
CLRoundLayer::~CLRoundLayer() = default;
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index bd0966b65f..48e9ae824a 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -26,13 +26,13 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
#include "src/core/utils/helpers/fft.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,7 +54,10 @@ void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT1DInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
@@ -77,13 +80,14 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
_digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
_memory_group.manage(&_digit_reversed_input);
- _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+ _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+ digit_reverse_config);
// Create and configure FFT kernels
unsigned int Nx = 1;
_num_ffts = decomposed_vector.size();
_fft_kernels.reserve(_num_ffts);
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -93,18 +97,20 @@ void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
_fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
- _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+ ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
Nx *= radix_for_stage;
}
// Configure scale kernel
- if(_run_scale)
+ if (_run_scale)
{
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
- is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+ : _scale_kernel->configure(output, nullptr, scale_config);
}
// Allocate tensors
@@ -123,7 +129,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
// Check if FFT is decomposable
const auto supported_radix = CLFFTRadixStageKernel::supported_radix();
@@ -132,7 +138,7 @@ Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -151,13 +157,13 @@ void CLFFT1D::run()
CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
// Run radix kernels
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
}
// Run output scaling
- if(_run_scale)
+ if (_run_scale)
{
CLScheduler::get().enqueue(*_scale_kernel, true);
}
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 94fc411355..3857046719 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -26,16 +26,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+ : _memory_group(memory_manager),
+ _first_pass_func(memory_manager),
+ _second_pass_func(memory_manager),
+ _first_pass_tensor()
{
}
@@ -46,7 +49,10 @@ void CLFFT2D::configure(const ICLTensor *input, ICLTensor *output, const FFT2DIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
}
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT2DInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
@@ -88,7 +94,7 @@ Status CLFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index d12e2de3bf..3894b10785 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -25,10 +25,12 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
#include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
#include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -38,8 +40,6 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/utils/helpers/fft.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace
@@ -50,11 +50,11 @@ int pad_decomposable(int N)
int pad = 0;
bool is_decomposed = false;
- while(!is_decomposed)
+ while (!is_decomposed)
{
const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
is_decomposed = !decomposed_vector.empty();
- if(!is_decomposed)
+ if (!is_decomposed)
{
++pad;
}
@@ -104,17 +104,31 @@ CLFFTConvolutionLayer::CLFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
{
}
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+ enable_fast_math);
}
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
- ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(), conv_info, act_info, enable_fast_math));
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_original_weights = weights;
@@ -124,21 +138,24 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_has_bias = biases != nullptr;
// Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
- pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ const Size2D input_dims =
+ Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size =
+ Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
// Tensors to use
ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
ICLTensor *output_to_use = _has_bias ? &_bias_output : output;
// Permute bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
_permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -146,7 +163,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Permute input if needed
_needs_permute = input->info()->data_layout() == DataLayout::NHWC;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -167,7 +184,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
// Pad weights
- const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
_pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
// Transform weights
@@ -175,10 +192,10 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
- const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
_memory_group.manage(&_padded_input);
_pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
}
@@ -202,7 +219,8 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_memory_group.manage(&_itransformed_output);
FFT2DInfo itranform_info;
itranform_info.direction = FFTDirection::Inverse;
- _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransformed_output.allocator()->init(
+ _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
_itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
_output_reduced.allocator()->allocate();
@@ -214,25 +232,28 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Extract correct region
const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
- const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
- const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
- if(_has_bias)
+ const int end_right =
+ _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_botton =
+ _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if (_has_bias)
{
_memory_group.manage(&_bias_output);
}
- else if(_needs_permute)
+ else if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+ Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
_itransformed_output.allocator()->allocate();
// Add bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
@@ -243,7 +264,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
}
// Permute output
- if(_needs_permute)
+ if (_needs_permute)
{
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -255,7 +276,7 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
// Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.configure(compile_context, output, nullptr, act_info);
}
@@ -269,8 +290,13 @@ void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, I
_flip_axis.unmap();
}
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
@@ -287,24 +313,27 @@ Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
const auto strides = conv_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+ conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+ conv_info.pad_bottom() != (kernel_size.y() / 2));
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+ (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
// Validate Activation Layer
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
}
@@ -320,7 +349,7 @@ void CLFFTConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Transform input
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_func.run();
}
@@ -336,17 +365,17 @@ void CLFFTConvolutionLayer::run()
_reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
_extract_output_func.run();
// Add bias
- if(_has_bias)
+ if (_has_bias)
{
_bias_add_func.run();
}
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_func.run();
}
// Run activation layer
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.run();
}
@@ -354,10 +383,10 @@ void CLFFTConvolutionLayer::run()
void CLFFTConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Permute bias to NCHW
- if(_original_bias != nullptr)
+ if (_original_bias != nullptr)
{
_permuted_bias.allocator()->allocate();
_permute_bias_func.run();
@@ -366,7 +395,7 @@ void CLFFTConvolutionLayer::prepare()
const ICLTensor *cur_weights = _original_weights;
// Permute weights
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 6019a84aba..9bd96a975e 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClFill.h"
@@ -36,16 +37,15 @@ namespace arm_compute
{
struct CLFill::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFill> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFill> op{nullptr};
};
-CLFill::CLFill()
- : _impl(std::make_unique<Impl>())
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
{
}
-CLFill::CLFill(CLFill &&) = default;
+CLFill::CLFill(CLFill &&) = default;
CLFill &CLFill::operator=(CLFill &&) = default;
CLFill::~CLFill() = default;
@@ -54,7 +54,10 @@ void CLFill::configure(ICLTensor *tensor, const PixelValue &constant_value, Wind
configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
}
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+void CLFill::configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ const PixelValue &constant_value,
+ Window *dst_window)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 32fc37552c..ba1b5372d3 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -26,8 +26,9 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/gpu/cl/operators/ClFlatten.h"
@@ -36,16 +37,15 @@ namespace arm_compute
{
struct CLFlattenLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFlatten> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFlatten> op{nullptr};
};
-CLFlattenLayer::CLFlattenLayer()
- : _impl(std::make_unique<Impl>())
+CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>())
{
}
-CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
+CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default;
CLFlattenLayer::~CLFlattenLayer() = default;
@@ -59,7 +59,8 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_impl->src = input;
_impl->dst = output;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_flatten_shape(input->info())));
_impl->op = std::make_unique<opencl::ClFlatten>();
_impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info());
@@ -68,9 +69,10 @@ void CLFlattenLayer::configure(const CLCompileContext &compile_context, const IC
Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
}
return opencl::ClFlatten::validate(input, output);
@@ -83,4 +85,4 @@ void CLFlattenLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 8739e1803e..4322219dd9 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClFloor.h"
@@ -34,16 +35,15 @@ namespace arm_compute
{
struct CLFloor::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClFloor> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClFloor> op{nullptr};
};
-CLFloor::CLFloor()
- : _impl(std::make_unique<Impl>())
+CLFloor::CLFloor() : _impl(std::make_unique<Impl>())
{
}
-CLFloor::CLFloor(CLFloor &&) = default;
+CLFloor::CLFloor(CLFloor &&) = default;
CLFloor &CLFloor::operator=(CLFloor &&) = default;
CLFloor::~CLFloor() = default;
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 1c162db79a..b30f9e701f 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClFullyConnected.h"
@@ -35,21 +36,22 @@ using namespace arm_compute::experimental;
struct CLFullyConnectedLayer::Impl
{
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
- std::unique_ptr<opencl::ClFullyConnected> op{ nullptr };
+ std::unique_ptr<opencl::ClFullyConnected> op{nullptr};
- const ITensor *original_weights{ nullptr };
+ const ITensor *original_weights{nullptr};
ITensorPack run_pack{};
WorkspaceData<CLTensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- bool is_prepared{ false };
- bool dynamic_weights{ false };
+ bool is_prepared{false};
+ bool dynamic_weights{false};
};
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
@@ -58,39 +60,45 @@ CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> mem
CLFullyConnectedLayer::~CLFullyConnectedLayer() = default;
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
}
-void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- fc_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(
+ input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info));
_impl->op = std::make_unique<opencl::ClFullyConnected>();
_impl->original_weights = weights;
_impl->is_prepared = fc_info.retain_internal_weights;
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
- if(_impl->weights_manager != nullptr)
+ if (_impl->weights_manager != nullptr)
{
_impl->weights_manager->manage(_impl->original_weights);
}
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->workspace = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->workspace =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
else
{
@@ -98,14 +106,14 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, c
_impl->run_pack.add_tensor(ACL_DST, output);
}
- _impl->dynamic_weights =
- !weights->info()->are_values_constant() &&
- fc_info.transpose_weights &&
- !fc_info.are_weights_reshaped &&
- !fc_info.retain_internal_weights;
+ _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
}
-Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
FullyConnectedLayerInfo fc_info)
{
return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info);
@@ -113,7 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
void CLFullyConnectedLayer::run()
{
- if(!_impl->dynamic_weights)
+ if (!_impl->dynamic_weights)
{
prepare();
}
@@ -124,7 +132,7 @@ void CLFullyConnectedLayer::run()
void CLFullyConnectedLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
@@ -133,13 +141,13 @@ void CLFullyConnectedLayer::prepare()
_impl->is_prepared = true;
// Handle weights managed infrastructure
- if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
+ if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
{
// Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare
// This is for cases where multiple functions share the same b (weights)
// Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference
const ITensor *original_b = _impl->original_weights;
- if(!original_b->is_used())
+ if (!original_b->is_used())
{
_impl->weights_manager->pre_mark_as_unused(original_b);
}
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 7379e9d9fe..e4fbf78e13 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -28,9 +28,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
namespace arm_compute
{
@@ -41,29 +41,52 @@ CLFuseBatchNormalization::CLFuseBatchNormalization()
CLFuseBatchNormalization::~CLFuseBatchNormalization() = default;
-void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
-void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
- ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias,
+ const ICLTensor *bn_beta,
+ const ICLTensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
- _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
+ _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
+ bn_beta, bn_gamma, epsilon, fbn_type);
}
-Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void CLFuseBatchNormalization::run()
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 427ea51ab9..871a1d6e27 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemm.h"
@@ -40,15 +41,15 @@ using OperatorType = opencl::ClGemm;
struct CLGEMM::Impl
{
- const ICLTensor *b{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *b{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -60,12 +61,25 @@ CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *
CLGEMM::~CLGEMM() = default;
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
}
-void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
@@ -73,25 +87,33 @@ void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor
_impl->op = std::make_unique<OperatorType>();
_impl->is_prepared = gemm_info.retain_internal_weights();
- _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info);
+ _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+ alpha, beta, gemm_info);
_impl->aux_mem_req = _impl->op->workspace();
// Manage/allocate auxilairy tensors
- if(_impl->is_prepared)
+ if (_impl->is_prepared)
{
_impl->run_pack.add_const_tensor(ACL_SRC_0, a);
_impl->run_pack.add_tensor(ACL_DST, output);
}
else
{
- _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, _impl->b } };
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, _impl->b}};
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info);
}
@@ -107,15 +129,15 @@ void CLGEMM::run()
void CLGEMM::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->b->mark_as_unused();
}
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c8c18f35db..aef7cddd7a 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,11 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemmConv2d.h"
#include "support/Cast.h"
@@ -47,18 +48,19 @@ using namespace arm_compute::experimental;
struct CLGEMMConvolutionLayer::Impl
{
- const ITensor *weights{ nullptr };
- std::unique_ptr<opencl::ClGemmConv2d> op{ nullptr };
+ const ITensor *weights{nullptr};
+ std::unique_ptr<opencl::ClGemmConv2d> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(memory_manager);
@@ -67,40 +69,60 @@ CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> m
CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, num_groups);
}
-void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
_impl->weights = weights;
_impl->op = std::make_unique<opencl::ClGemmConv2d>();
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, input },
- { TensorType::ACL_SRC_1, weights },
- { TensorType::ACL_SRC_2, biases },
- { TensorType::ACL_DST, output }
- };
- _impl->prep_pack =
- {
- { TensorType::ACL_SRC_1, weights },
- { TensorType::ACL_SRC_2, biases },
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input},
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
};
- _impl->aux_mem_req = _impl->op->workspace();
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ unsigned int num_groups)
{
const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
@@ -115,14 +137,14 @@ void CLGEMMConvolutionLayer::run()
void CLGEMMConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->weights->mark_as_unused();
}
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 9fc81c11da..7d40cf1829 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include <tuple>
@@ -40,12 +40,13 @@ namespace arm_compute
{
namespace
{
-std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+std::pair<Coordinates, Coordinates>
+compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
{
Coordinates start;
Coordinates end;
- if(is_nchw)
+ if (is_nchw)
{
start.set(0, deconv_info.pad_left());
start.set(1, deconv_info.pad_top());
@@ -63,13 +64,16 @@ std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const IT
end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
}
- return { start, end };
+ return {start, end};
}
-Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ GEMMLowpOutputStageInfo &output_stage_info)
{
const auto data_type = input->data_type();
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
@@ -78,7 +82,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorIn
float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
int output_multiplier(0);
int output_shift(0);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -122,15 +127,21 @@ CLGEMMDeconvolutionLayer::CLGEMMDeconvolutionLayer(std::shared_ptr<IMemoryManage
CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default;
-Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
DataLayout data_layout = input->data_layout();
- const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+ const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 ||
+ deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
const bool is_nchw = input->data_layout() == DataLayout::NCHW;
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
@@ -144,21 +155,31 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
TensorShape nhwc_weights_shape = weights->tensor_shape();
TensorShape nhwc_input_shape = input->tensor_shape();
- if(is_nchw)
+ if (is_nchw)
{
permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
permute(nhwc_input_shape, PermutationVector(2, 0, 1));
- TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+ TensorInfo nhwc_input_info = input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(nhwc_input_shape)
+ .set_data_layout(DataLayout::NCHW);
- TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+ TensorInfo nhwc_weights_info = weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(nhwc_weights_shape)
+ .set_data_layout(DataLayout::NCHW);
CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
}
- const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
- const TensorInfo reshaped_info = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+ const TensorShape reshaped_shape =
+ TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+ const TensorInfo reshaped_info =
+ weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
TensorShape transposed_shape(reshaped_shape[1], reshaped_shape[0]);
@@ -166,77 +187,95 @@ Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITenso
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
- input->dimension(idx_w),
- input->dimension(idx_h),
- input->dimension(idx_b));
+ input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b));
TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
GEMMInfo gemm_info(false, false, true, input->dimension(idx_h), true);
GEMMLowpOutputStageInfo output_stage_info;
- if(is_quantized)
+ if (is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+ &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr,
+ &gemm_output_info.set_data_type(DataType::S32), gemm_info));
ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true),
+ &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
}
const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
- auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
- const TensorShape deconv_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
- TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+ auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+ weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+ const TensorShape deconv_shape =
+ misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+ TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
- if(padded_input && is_quantized)
+ if (padded_input && is_quantized)
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(
+ &col2im_output_info, nullptr,
+ &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()),
+ output, start_end.first, start_end.second));
}
- else if(padded_input)
+ else if (padded_input)
{
const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
}
- else if(is_quantized)
+ else if (is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+ &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
}
return Status{};
}
-void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
}
-void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
- weights->info(),
- bias != nullptr ? bias->info() : nullptr,
- output->info(),
- deconv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(
+ input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info));
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info);
_original_weights = weights;
- _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
- _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 ||
+ deconv_info.pad_top() > 0;
+ _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const ICLTensor *input_to_use = input;
const ICLTensor *weights_to_use = weights;
@@ -245,7 +284,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
// do an outer product in NCHW and then an accumulation through a reduction. This would have two
// drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
// might be slower than GEMM.
- if(_is_nchw)
+ if (_is_nchw)
{
_memory_group.manage(&_permuted_input);
_permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
@@ -257,10 +296,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
// Reshape the input weights. The weights will be reshaped only once during the call to prepare()
- _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
- weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
- 1,
- input->info()->data_type(), weights->info()->quantization_info()));
+ _reshaped_weights.allocator()->init(
+ TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) *
+ weights_to_use->info()->dimension(2) *
+ weights_to_use->info()->dimension(3)),
+ 1, input->info()->data_type(), weights->info()->quantization_info()));
_reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
_transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
@@ -269,15 +309,17 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
GEMMInfo gemm_info(false, false, true, input->info()->dimension(idx_h), true);
// Configure output stage for asymmetric quantized types
- if(_is_quantized)
+ if (_is_quantized)
{
// gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original
// and restore them back to make it work properly.
QuantizationInfo iq_info = input->info()->quantization_info();
QuantizationInfo wq_info = weights->info()->quantization_info();
- input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
- _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
+ input_to_use->info()->set_quantization_info(
+ QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
+ _reshaped_weights_t.info()->set_quantization_info(
+ QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
_mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
@@ -286,10 +328,11 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
else
{
- _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+ _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f,
+ gemm_info);
}
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_input.allocator()->allocate();
}
@@ -298,7 +341,7 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
ICLTensor *slice_output = nullptr;
ICLTensor *output_stage_output = nullptr;
- if(_padded_input && _is_quantized)
+ if (_padded_input && _is_quantized)
{
_memory_group.manage(&_slice_gemm_input);
_memory_group.manage(&_gemmlowp_final);
@@ -306,13 +349,13 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
output_stage_output = &_slice_gemm_input;
slice_output = output;
}
- else if(_padded_input)
+ else if (_padded_input)
{
_memory_group.manage(&_slice_gemm_input);
deconv_reshape_output = &_slice_gemm_input;
slice_output = output;
}
- else if(_is_quantized)
+ else if (_is_quantized)
{
_memory_group.manage(&_gemmlowp_final);
deconv_reshape_output = &_gemmlowp_final;
@@ -324,21 +367,24 @@ void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context
}
// Configure a Col2Im call to reshape the output of GEMM
- _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+ _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(),
+ weights->info(), deconv_info);
_gemm_output.allocator()->allocate();
- if(_is_quantized)
+ if (_is_quantized)
{
GEMMLowpOutputStageInfo output_stage_info;
construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
- _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+ _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output,
+ output_stage_info);
_gemmlowp_final.allocator()->allocate();
}
// If the input was padded, the output needs to be sliced.
- if(_padded_input)
+ if (_padded_input)
{
- const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+ const auto start_end =
+ compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
_slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
_slice_gemm_input.allocator()->allocate();
}
@@ -350,12 +396,12 @@ void CLGEMMDeconvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input_to_nhwc.run();
}
- if(_is_quantized)
+ if (_is_quantized)
{
_mm_gemmlowp.run();
}
@@ -366,12 +412,12 @@ void CLGEMMDeconvolutionLayer::run()
CLScheduler::get().enqueue(*_deconv_reshape, false);
- if(_is_quantized)
+ if (_is_quantized)
{
_gemmlowp_output_stage.run();
}
- if(_padded_input)
+ if (_padded_input)
{
_slice_gemm.run();
}
@@ -379,11 +425,11 @@ void CLGEMMDeconvolutionLayer::run()
void CLGEMMDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_weights.allocator()->allocate();
_permute_weights_to_nhwc.run();
@@ -392,7 +438,7 @@ void CLGEMMDeconvolutionLayer::prepare()
_reshaped_weights.allocator()->allocate();
_reshape_weights.run();
- if(_is_nchw)
+ if (_is_nchw)
{
_permuted_weights.allocator()->free();
}
@@ -401,7 +447,7 @@ void CLGEMMDeconvolutionLayer::prepare()
_transpose_weights.run();
// Prepare gemm
- if(!_is_quantized)
+ if (!_is_quantized)
{
_mm_gemm.prepare();
}
@@ -411,7 +457,7 @@ void CLGEMMDeconvolutionLayer::prepare()
}
// Free resources
- if(!_reshaped_weights_t.is_used())
+ if (!_reshaped_weights_t.is_used())
{
_reshaped_weights_t.allocator()->free();
}
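
Aside (not part of the patch): in the quantized branch above, construct_gemmlowp_output_stage() folds the input, weight and output scales into one real multiplier (iq.scale * wq.scale / oq.scale) and asks quantization::calculate_quantized_multiplier() for the integer multiplier/shift pair stored in GEMMLowpOutputStageInfo. The stand-alone sketch below shows one common way such a decomposition is done (frexp into a Q31 mantissa plus a power-of-two shift); it is an illustration only, not the library's implementation, and the sign convention chosen for the shift is an assumption.

// Stand-alone sketch: split a real requantization multiplier into a Q31
// fixed-point multiplier and a right shift. Illustration only.
#include <cmath>
#include <cstdint>
#include <cstdio>

static void decompose_multiplier(float multiplier, int32_t *quantized_multiplier, int *right_shift)
{
    // multiplier == mantissa * 2^exponent, with mantissa in [0.5, 1)
    int         exponent = 0;
    const float mantissa = std::frexp(multiplier, &exponent);

    // Represent the mantissa as a Q31 integer; the power of two becomes the shift.
    int64_t q = static_cast<int64_t>(std::lround(mantissa * static_cast<double>(1ll << 31)));
    if (q == (1ll << 31)) // mantissa rounded up to 1.0
    {
        q /= 2;
        ++exponent;
    }
    *quantized_multiplier = static_cast<int32_t>(q);
    *right_shift          = -exponent; // multiplier < 1 => positive right shift
}

int main()
{
    int32_t qmul  = 0;
    int     shift = 0;
    decompose_multiplier(0.0072f, &qmul, &shift); // 0.0072f stands in for a combined scale
    std::printf("quantized_multiplier=%d right_shift=%d\n", static_cast<int>(qmul), shift);
    return 0;
}

Any C++11 compiler builds the sketch as-is.
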
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index d9029478a1..8bad198658 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -31,12 +31,12 @@
#include "arm_compute/core/Log.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/IMemoryManager.h"
-#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
namespace arm_compute
@@ -46,13 +46,13 @@ using OperatorType = opencl::ClGemmLowpMatrixMultiplyCore;
struct CLGEMMLowpMatrixMultiplyCore::Impl
{
- const ICLTensor *b{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *b{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
MemoryRequirements aux_mem_req{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
@@ -63,12 +63,18 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(
+ const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
}
-void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
@@ -76,23 +82,29 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con
_impl->op = std::make_unique<OperatorType>();
_impl->is_prepared = gemm_info.retain_internal_weights();
- _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info);
+ _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+ gemm_info);
_impl->aux_mem_req = _impl->op->workspace();
    // Manage/allocate auxiliary tensors
- if(_impl->is_prepared)
+ if (_impl->is_prepared)
{
_impl->run_pack.add_const_tensor(ACL_SRC_0, a);
_impl->run_pack.add_tensor(ACL_DST, output);
}
else
{
- _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } };
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}};
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
}
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
return OperatorType::validate(a, b, c, output, gemm_info);
}
@@ -108,7 +120,7 @@ void CLGEMMLowpMatrixMultiplyCore::run()
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
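
Aside (not part of the patch): the Impl struct above follows the function/operator split used throughout these files: the function keeps an ITensorPack (run_pack) that maps argument slots such as ACL_SRC_0, ACL_SRC_1 and ACL_DST to tensors and hands it to the operator in run()/prepare(). The mock below illustrates that idiom only; every type and name in it is invented for the example and none of it is Compute Library API.

// Stand-alone mock of the "tensor pack" idiom: a small map from argument slots
// to tensor pointers that an operator receives at execution time.
#include <cstdio>
#include <unordered_map>

enum Slot : int { SRC_0, SRC_1, SRC_2, DST };

struct Tensor
{
    const char *name;
};

class Pack
{
public:
    void    add_tensor(Slot slot, Tensor *t) { _map[static_cast<int>(slot)] = t; }
    Tensor *get_tensor(Slot slot) const
    {
        const auto it = _map.find(static_cast<int>(slot));
        return it != _map.end() ? it->second : nullptr;
    }

private:
    std::unordered_map<int, Tensor *> _map{};
};

int main()
{
    Tensor a{"a"}, b{"weights"}, dst{"output"};

    Pack pack;
    pack.add_tensor(SRC_0, &a);  // left-hand side of the GEMM
    pack.add_tensor(SRC_1, &b);  // right-hand side (kept as Impl::b above)
    pack.add_tensor(DST, &dst);  // destination tensor

    std::printf("DST -> %s\n", pack.get_tensor(DST)->name);
    return 0;
}
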
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 6feed0d713..3dd8c5f101 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -40,27 +40,33 @@ namespace arm_compute
{
struct CLGEMMLowpOutputStage::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *bias{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClGemmLowpOutputStage> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *bias{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr};
ITensorPack run_pack{};
};
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage()
- : _impl(std::make_unique<Impl>())
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
{
}
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default;
CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage() = default;
-void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
}
-void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -69,11 +75,15 @@ void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, c
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>();
- _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info);
- _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } };
+ _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(),
+ info);
+ _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}};
}
-Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info)
{
return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info);
}
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index 033c117cec..2610cb1a3b 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLGather.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/kernels/CLGatherKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
namespace arm_compute
{
@@ -35,7 +35,11 @@ void CLGather::configure(const ICLTensor *input, const ICLTensor *indices, ICLTe
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
}
-void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGather::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis)
{
ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
auto k = std::make_unique<CLGatherKernel>();
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 9cb7d618cf..b2c1d2631e 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -27,13 +27,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
#include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
#include "src/core/CL/kernels/CLPadLayerKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -71,48 +71,67 @@ CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManage
CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default;
-void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+void CLGenerateProposalsLayer::configure(const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+ configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out,
+ num_valid_proposals, info);
}
-void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
- ICLTensor *scores_out,
- ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
+void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+ proposals->info(), scores_out->info(),
+ num_valid_proposals->info(), info));
ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
_is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType scores_data_type = scores->info()->data_type();
_is_qasymm8 = scores_data_type == DataType::QASYMM8;
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ const int num_anchors = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
- const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+ const QuantizationInfo rois_qinfo =
+ (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
- _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors,
+ ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+ _deltas_flattened.allocator()->init(
+ TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
_memory_group.manage(&_deltas_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
_flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
@@ -126,10 +145,10 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
// Permute and reshape scores
_memory_group.manage(&_scores_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1});
_flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
@@ -140,7 +159,7 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
CLTensor *anchors_to_use = &_all_anchors;
CLTensor *deltas_to_use = &_deltas_flattened;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
_deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -163,11 +182,12 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
anchors_to_use->allocator()->allocate();
_all_proposals_to_use = &_all_proposals;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
- _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _all_proposals_quantized.allocator()->init(
+ TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
_quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
@@ -183,7 +203,8 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
// Note that NMS needs outputs preinitialized.
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+ rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
@@ -195,20 +216,27 @@ void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context
_num_valid_proposals = num_valid_proposals;
_memory_group.manage(&_proposals_4_roi_values);
- _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
- BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+ _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values,
+ &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+ BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+ true, min_size_scaled, info.im_width(), info.im_height()));
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
_all_proposals_to_use->allocator()->allocate();
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
_proposals_4_roi_values.allocator()->allocate();
}
-Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
- const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -216,9 +244,12 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
- const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_anchors =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -227,76 +258,101 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
}
- TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
-
- TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- if(scores->data_layout() == DataLayout::NHWC)
+ TensorInfo all_anchors_info(
+ anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(
+ anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info =
+ deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+ .set_is_resizable(true);
+ TensorInfo scores_permuted_info =
+ scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if (scores->data_layout() == DataLayout::NHWC)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
}
- TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo deltas_flattened_info(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
- TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(
+ scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
- TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
- if(is_qasymm8)
+ TensorInfo proposals_4_roi_values_quantized(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+ .set_quantization_info(QuantizationInfo(0.125f, 0));
+ if (is_qasymm8)
{
- TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ TensorInfo all_anchors_f32_info(anchors->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
- TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
-
- TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ TensorInfo deltas_flattened_f32_info(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(
+ &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
- if(num_valid_proposals->total_size() > 0)
+ if (num_valid_proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
}
- if(proposals->total_size() > 0)
+ if (proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +365,7 @@ Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
}
}
- if(scores_out->total_size() > 0)
+ if (scores_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -356,7 +412,7 @@ void CLGenerateProposalsLayer::run()
CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
// Transpose and reshape the inputs
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_permute_deltas.run();
_permute_scores.run();
@@ -364,7 +420,7 @@ void CLGenerateProposalsLayer::run()
_flatten_deltas.run();
_flatten_scores.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_dequantize_anchors->run();
_dequantize_deltas->run();
@@ -373,7 +429,7 @@ void CLGenerateProposalsLayer::run()
// Build the boxes
CLScheduler::get().enqueue(*_bounding_box_kernel, false);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_quantize_all_proposals->run();
}
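
Aside (not part of the patch): the QASYMM8 path above requantizes the transformed proposals to QASYMM16 with scale 0.125 and offset 0. Assuming the usual unsigned 16-bit asymmetric scheme, that is the affine mapping q = round(x / scale) + offset clamped to [0, 65535], i.e. box coordinates stored at 1/8-pixel resolution. The helpers below are illustrative only and are not Compute Library functions.

// Stand-alone sketch of QASYMM16 quantization with scale 0.125 and offset 0.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint16_t quantize_qasymm16(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<uint16_t>(std::min(std::max(q, 0), 65535)); // clamp to the QASYMM16 range
}

static float dequantize_qasymm16(uint16_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}

int main()
{
    const float    x = 123.4f; // e.g. one proposal coordinate in pixels
    const uint16_t q = quantize_qasymm16(x, 0.125f, 0);
    std::printf("x=%.2f -> q=%d -> x'=%.3f\n", x, static_cast<int>(q), dequantize_qasymm16(q, 0.125f, 0));
    return 0;
}
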
diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
index 90af36aa77..1a2369c5c2 100644
--- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
@@ -26,36 +26,45 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/gpu/cl/operators/ClIndirectConv2d.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
namespace arm_compute
{
struct CLIndirectConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClIndirectConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr};
};
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer()
- : _impl(std::make_unique<Impl>())
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>())
{
}
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default;
CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer() = default;
-void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
}
-void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -65,10 +74,15 @@ void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_conte
_impl->biases = biases;
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClIndirectConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
}
-Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -83,4 +97,4 @@ void CLIndirectConvolutionLayer::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-}
+} // namespace arm_compute

diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index 5feafe19db..0e994e1aee 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -27,50 +27,62 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT
- : _inst_norm_kernel(),
- _mean_var_kernel(),
- _mean_var_tensor(),
- _ctx(ctx)
+ : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx)
{
}
CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer()
{
}
-void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(
+ ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
}
-void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision)
{
ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision);
auto w = std::make_unique<CLComputeMeanVariance>();
w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision);
_mean_var_kernel = std::move(w);
auto k = std::make_unique<CLInstanceNormalizationLayerKernel>();
- k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ k->configure(compile_context, input, &_mean_var_tensor, output,
+ InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
_inst_norm_kernel = std::move(k);
_mean_var_tensor.allocator()->allocate();
}
-Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision)
{
- return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+ return CLInstanceNormalizationLayerKernel::validate(
+ input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
}
void CLInstanceNormalizationLayer::run()
{
- ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured");
+ ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel,
+ "The child class didn't set the CL kernel or function isn't configured");
schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get());
schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get());
}
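
Aside (not part of the patch): the pair of kernels configured above (CLComputeMeanVariance followed by CLInstanceNormalizationLayerKernel) implements instance normalization, where each (sample, channel) plane is normalized with its own statistics and then scaled and shifted by the gamma/beta/epsilon arguments. The reference below applies the textbook formula, y = gamma * (x - mean) / sqrt(var + epsilon) + beta, to one flattened plane; it is a sketch, not the CL kernels.

// Stand-alone reference for instance normalization of a single plane.
#include <cmath>
#include <cstdio>
#include <vector>

static void instance_norm_plane(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
    const float n = static_cast<float>(plane.size()); // assumes a non-empty plane

    float mean = 0.f;
    for (float v : plane)
        mean += v;
    mean /= n;

    float var = 0.f;
    for (float v : plane)
        var += (v - mean) * (v - mean);
    var /= n;

    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for (float &v : plane)
        v = gamma * (v - mean) * inv_stddev + beta;
}

int main()
{
    std::vector<float> plane = {1.f, 2.f, 3.f, 4.f};
    instance_norm_plane(plane, /*gamma=*/1.f, /*beta=*/0.f, /*epsilon=*/1e-12f);
    std::printf("%.3f %.3f %.3f %.3f\n", plane[0], plane[1], plane[2], plane[3]);
    return 0;
}
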
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 1278385f53..4fe1d9b20b 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -29,12 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace
@@ -57,7 +57,8 @@ void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, int axis
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
}
-void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayer::configure(
+ const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
{
ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
@@ -86,7 +87,8 @@ Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
sum_sq.set_tensor_shape(shape);
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
shape.set(actual_axis, 1);
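
Aside (not part of the patch): the validate() path above reduces the chosen axis with a SUM_SQUARE reduction; the usual L2-normalize formulation (assumed here) then divides every element by sqrt(max(sum_sq, epsilon)). The 1-D sketch below illustrates that arithmetic only and is not the CL implementation.

// Stand-alone sketch of L2 normalization along a single axis.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static void l2_normalize(std::vector<float> &x, float epsilon)
{
    float sum_sq = 0.f;
    for (float v : x)
        sum_sq += v * v;

    const float inv_norm = 1.f / std::sqrt(std::max(sum_sq, epsilon));
    for (float &v : x)
        v *= inv_norm;
}

int main()
{
    std::vector<float> v = {3.f, 4.f};
    l2_normalize(v, 1e-12f);
    std::printf("%.3f %.3f\n", v[0], v[1]); // ~0.600 0.800
    return 0;
}
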
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index ea08beca75..3b50234c77 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -24,15 +24,15 @@
#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/gpu/cl/kernels/ClTransposeKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
namespace arm_compute
{
@@ -40,54 +40,155 @@ using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::utils::info_helpers;
CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
- _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(),
- _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(),
- _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(),
- _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(),
- _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(),
- _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(),
- _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(),
- _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(),
- _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(),
- _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false),
- _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
+ : _memory_group(std::move(memory_manager)),
+ _fully_connected_input_gate(),
+ _accum_input_gate1(),
+ _subtract_input_gate(),
+ _pixelwise_mul_input_gate(),
+ _activation_input_gate(),
+ _fully_connected_forget_gate(),
+ _accum_forget_gate1(),
+ _pixelwise_mul_forget_gate(),
+ _activation_forget_gate(),
+ _fully_connected_cell_state(),
+ _gemm_cell_state1(),
+ _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()),
+ _accum_cell_state1(),
+ _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(),
+ _activation_cell_state(),
+ _cell_clip(),
+ _pixelwise_mul_cell_state2(),
+ _fully_connected_output(),
+ _pixelwise_mul_output_state1(),
+ _accum_output1(),
+ _activation_output(),
+ _activation_output_state(),
+ _pixelwise_mul_output_state2(),
+ _fully_connected_output_state(),
+ _projection_clip(),
+ _copy_cell_state(),
+ _copy_output(),
+ _concat_scratch_buffer(),
+ _concat_inputs_forget_gate(),
+ _concat_weights_forget_gate(),
+ _concat_weights_input_gate(),
+ _concat_weights_output(),
+ _ones_fill(),
+ _mean_std_norm_input_gate(),
+ _pixelwise_mul_input_gate_coeff(),
+ _accum_input_gate_bias(),
+ _mean_std_norm_forget_gate(),
+ _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(),
+ _mean_std_norm_cell_gate(),
+ _pixelwise_mul_cell_gate_coeff(),
+ _accum_cell_gate_bias(),
+ _mean_std_norm_output_gate(),
+ _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(),
+ _input_gate_out1(),
+ _input_gate_out2(),
+ _input_gate_out3(),
+ _input_gate_out4(),
+ _forget_gate_out1(),
+ _forget_gate_out2(),
+ _forget_gate_out3(),
+ _forget_gate_out4(),
+ _forget_gate_out5(),
+ _forget_gate_out6(),
+ _cell_state_out1(),
+ _cell_state_out2(),
+ _cell_state_out3(),
+ _cell_state_out4(),
+ _cell_state_out5(),
+ _output1(),
+ _output2(),
+ _output3(),
+ _output4(),
+ _cell_state_activation(),
+ _output_state1(),
+ _ones(),
+ _input_layer_norm_out1(),
+ _input_layer_norm_out2(),
+ _forget_layer_norm_out1(),
+ _forget_layer_norm_out2(),
+ _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(),
+ _output_layer_norm_out1(),
+ _output_layer_norm_out2(),
+ _run_peephole_opt(false),
+ _run_cifg_opt(false),
+ _perform_cell_clipping(false),
+ _has_projection_weights(false),
+ _perform_projection_clipping(false),
+ _is_prepared(false),
+ _is_layer_norm_lstm(false)
{
}
CLLSTMLayer::~CLLSTMLayer() = default;
-void CLLSTMLayer::configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in,
+ cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
cell_threshold, projection_threshold);
}
-void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
- ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out,
- output, lstm_params, activation_info, cell_threshold, projection_threshold);
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
_is_layer_norm_lstm = lstm_params.use_layer_norm();
@@ -96,13 +197,12 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
- input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- output_state_in->info(), cell_state_in->info(),
- scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
- lstm_params_info, activation_info, cell_threshold, projection_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+ cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
// Configure block that calculates the forget gate
@@ -126,26 +226,31 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
weights_vector.emplace_back(input_to_forget_weights);
weights_vector.emplace_back(recurrent_to_forget_weights);
- const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+ const TensorShape weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
_forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6,
+ (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
CLTensor *forget_gate_out = &_forget_gate_out5;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_forget_gate_out4, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+ ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -154,22 +259,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
{
_forget_gate_out3.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
_mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out,
+ lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias,
+ &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -178,12 +286,13 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
// input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
CLTensor *input_gate_out = &_input_gate_out1;
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
- _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
}
@@ -195,7 +304,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
std::vector<const ICLTensor *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
_input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX);
@@ -203,15 +313,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out3);
- _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2,
+ (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+ &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+ &_input_gate_out4, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -221,22 +336,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_input_gate_out1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
_mean_std_norm_input_gate.configure(compile_context, input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out,
+ lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(),
+ &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(compile_context, input_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
@@ -249,44 +367,54 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights,
+ (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info());
_recurrent_to_cell_weights = recurrent_to_cell_weights;
_memory_group.manage(&_cell_state_out3);
- _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+ _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f,
+ 0.f);
_cell_state_out2.allocator()->allocate();
_memory_group.manage(&_cell_state_out4);
- _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+ _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4,
+ ConvertPolicy::SATURATE);
CLTensor *cell_state_out_ptr = &_cell_state_out4;
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
_mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr,
+ lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
_activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
- _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1,
+ ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
- if(cell_threshold != 0.f)
+ if (cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
+ _cell_clip.configure(compile_context, &_cell_state_out1, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold));
}
// Configure block that calculates the output
@@ -298,7 +426,8 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
std::vector<const ICLTensor *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
_output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX);
@@ -306,18 +435,20 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2,
+ (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
CLTensor *output_gate_out = &_output4;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(),
+ &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
_accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -329,22 +460,25 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
{
_output1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
_mean_std_norm_output_gate.configure(compile_context, output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out,
+ lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias,
+ &_output_layer_norm_out2, ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(compile_context, output_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -361,19 +495,24 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
_memory_group.manage(&_cell_state_activation);
_activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out,
+ output_state_out_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN);
_cell_state_activation.allocator()->allocate();
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
- if(projection_threshold != 0.f)
+ if (projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(compile_context, output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -projection_threshold, projection_threshold));
}
}
@@ -383,7 +522,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
// Vector for holding the tensors to store in scratch buffer
std::vector<const ICLTensor *> scratch_inputs;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(input_gate_out);
}
@@ -397,29 +536,38 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe
output_gate_out->allocator()->allocate();
}
-Status CLLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check dimensions
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -438,16 +586,16 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
- && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+ cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
// If CIFG is used, input layer normalization weights tensor is omitted
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
}
@@ -459,8 +607,12 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -470,7 +622,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
}
// Check peephole optimization
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -488,36 +640,42 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
TensorInfo cell_state_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
- lstm_params.recurrent_to_input_weights(),
- lstm_params.input_gate_bias());
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -525,88 +683,121 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
- TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, lstm_params.input_to_input_weights(),
+ (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+ &input_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(lstm_params.use_layer_norm())
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(cell_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (cell_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
- -cell_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&cell_state_tmp, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold)));
}
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
- TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
// Validate output gate tmp
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+ input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_NEAREST_EVEN));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
- if(lstm_params.has_projection())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
- if(projection_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_NEAREST_EVEN));
+ if (lstm_params.has_projection())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out));
+ if (projection_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out,
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
}
}
@@ -616,7 +807,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
// Validate scratch concatenation
std::vector<const ITensorInfo *> inputs_vector_info_raw;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
@@ -638,12 +829,12 @@ void CLLSTMLayer::run()
_fully_connected_forget_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
_pixelwise_mul_forget_gate_coeff.run();
@@ -651,7 +842,7 @@ void CLLSTMLayer::run()
}
_activation_forget_gate.run();
- if(_run_cifg_opt)
+ if (_run_cifg_opt)
{
_ones_fill.run();
_subtract_input_gate.run();
@@ -660,13 +851,13 @@ void CLLSTMLayer::run()
{
_fully_connected_input_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
_pixelwise_mul_input_gate_coeff.run();
@@ -679,12 +870,10 @@ void CLLSTMLayer::run()
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights);
pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2);
- CLScheduler::get().enqueue_op(*_transpose_cell_state,
- pack,
- false);
+ CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false);
_gemm_cell_state1.run();
_accum_cell_state1.run();
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
_pixelwise_mul_cell_gate_coeff.run();
@@ -695,19 +884,19 @@ void CLLSTMLayer::run()
_pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
- if(_perform_cell_clipping)
+ if (_perform_cell_clipping)
{
_cell_clip.run();
}
_fully_connected_output.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_output_state1.run();
_accum_output1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
_pixelwise_mul_output_gate_coeff.run();
@@ -718,10 +907,10 @@ void CLLSTMLayer::run()
_activation_output_state.run();
_pixelwise_mul_output_state2.run();
- if(_has_projection_weights)
+ if (_has_projection_weights)
{
_fully_connected_output_state.run();
- if(_perform_projection_clipping)
+ if (_perform_projection_clipping)
{
_projection_clip.run();
}
@@ -735,10 +924,10 @@ void CLLSTMLayer::run()
void CLLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_concat_weights_forget_gate.run();
- if(!_run_cifg_opt)
+ if (!_run_cifg_opt)
{
_concat_weights_input_gate.run();
}
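Taken together, the CLLSTMLayer.cpp hunks above show the layout this reformatting converges on: member initializers and declaration parameters go one per line once a declaration overflows the column limit, wrapped call arguments stay packed up to roughly 120 columns, control keywords gain a space before the parenthesis (if (...) instead of if(...)), and binary operators such as && stay at the end of the broken line. The hypothetical snippet below, which is not ComputeLibrary code, restates that declaration style on a made-up function (accumulate_gate_outputs) so it can be read outside the +/- noise of the diff; it reflects what a clang-format setup with parameter bin-packing disabled and a wide column limit typically produces, though the exact configuration applied here is not shown.

#include <iostream>

// Hypothetical example, not part of ComputeLibrary: a declaration that exceeds
// the column limit gets one parameter per line, aligned after the open paren.
static int accumulate_gate_outputs(int forget_gate_contribution,
                                   int input_gate_contribution,
                                   int cell_state_contribution,
                                   int output_gate_contribution)
{
    // Wrapped expressions keep the binary operator at the end of the line.
    const int sum = forget_gate_contribution + input_gate_contribution + cell_state_contribution +
                    output_gate_contribution;
    if (sum < 0) // space between `if` and `(`, as in the reformatted hunks
    {
        return 0;
    }
    return sum;
}

int main()
{
    std::cout << accumulate_gate_outputs(1, 2, 3, 4) << '\n'; // prints 10
}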
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index d14c6102d5..ea64eda023 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -25,12 +25,12 @@
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "arm_compute/core/Validate.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include <memory>
@@ -46,48 +46,129 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit
} // namespace
CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
- _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
- _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
- _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
- _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
- _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
- _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
- _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+ : _memory_group(std::move(memory_manager)),
+ _gemmlowp(),
+ _output_stage(),
+ _transpose_weights(),
+ _concat_input_weights(),
+ _concat_recurrent_weights(),
+ _concat_weights(),
+ _concat_inputs(),
+ _concat_bias(),
+ _sigmoid_forget_gate(),
+ _sigmoid_input_gate(),
+ _sigmoid_output_gate(),
+ _tanh_modulation_gate(),
+ _tanh_output_state(),
+ _add_cell_state_tmps(),
+ _add2(),
+ _mul_forget_gate_cell_state(),
+ _mul_input_gate_input_mod_gate(),
+ _mul_output_state_tmp_output_gate(),
+ _slice_input_tensor(),
+ _slice_forget_tensor(),
+ _slice_cell_tensor(),
+ _slice_output_tensor(),
+ _dequantize(),
+ _quantize(),
+ _input_to_input_weights(nullptr),
+ _input_to_forget_weights(nullptr),
+ _input_to_cell_weights(nullptr),
+ _input_to_output_weights(nullptr),
+ _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr),
+ _recurrent_to_cell_weights(nullptr),
+ _recurrent_to_output_weights(nullptr),
+ _input_gate_bias(nullptr),
+ _forget_gate_bias(nullptr),
+ _cell_bias(nullptr),
+ _output_gate_bias(nullptr),
+ _recurrent_weights(),
+ _input_weights(),
+ _weights(),
+ _input(),
+ _weights_transposed(),
+ _output_highp(),
+ _output_lowp(),
+ _bias(),
+ _forget_gate_input(),
+ _input_gate_input(),
+ _output_gate_input(),
+ _input_modulation_gate_input(),
+ _forget_gate_output(),
+ _input_gate_output(),
+ _output_gate_output(),
+ _input_modulation_gate_output(),
+ _cell_state_tmp1(),
+ _cell_state_tmp2(),
+ _output_state_tmp(),
+ _output_state_out_symm(),
+ _output_state_out_f32(),
+ _is_prepared(false)
{
}
void CLLSTMLayerQuantized::configure(const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out)
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
- output_state_out);
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
}
-void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out)
+void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
-
- ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+ cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
output_state_out);
- ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
- input_to_output_weights->info(),
- recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(
+ input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+ recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info()));
const int input_size = input->info()->dimension(0);
const int batch_size = input->info()->dimension(1);
@@ -95,8 +176,10 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
- auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
- auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+ auto_init_if_empty(*cell_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
_input_to_input_weights = input_to_input_weights;
_input_to_forget_weights = input_to_forget_weights;
@@ -124,17 +207,20 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
- _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _input_weights.allocator()->init(
+ TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY);
- _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _recurrent_weights.allocator()->init(
+ TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY);
std::vector<const ICLTensor *> weights_vector;
weights_vector.emplace_back(&_recurrent_weights);
weights_vector.emplace_back(&_input_weights);
- _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _weights.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX);
_transpose_weights.configure(compile_context, &_weights, &_weights_transposed);
@@ -144,7 +230,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
input_vector.emplace_back(output_state_in);
_memory_group.manage(&_input);
- _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _input.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
_concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX);
// Bias concatenation
@@ -159,7 +246,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
// Run gemmlowp
_memory_group.manage(&_output_highp);
@@ -169,7 +257,8 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
// Set the offset back
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
// multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
_output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
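
For context on the comment above: the output stage scale is fixed at 2^-12, so the effective requantization multiplier is (input_scale * weights_scale) * 4096, which validate() further below computes as `4096.f * qasymm.uniform().scale * qweights.uniform().scale`. A minimal sketch of how such a real-valued multiplier is typically split into a Q31 fixed-point multiplier plus a shift follows; it is illustrative only and may differ in rounding details from the library's quantization::calculate_quantized_multiplier().

    #include <cmath>
    #include <cstdint>

    // Sketch only: decompose `multiplier` so that x * multiplier can be evaluated as
    // (x * quantized_multiplier) >> (31 + shift) in integer arithmetic.
    void decompose_multiplier(double multiplier, int32_t &quantized_multiplier, int &shift)
    {
        int exponent  = 0;
        const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
        if (q_fixed == (1LL << 31)) // rounding pushed q up to 1.0: renormalise
        {
            q_fixed /= 2;
            ++exponent;
        }
        quantized_multiplier = static_cast<int32_t>(q_fixed);
        shift                = -exponent; // positive value means a right shift at runtime
    }
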
@@ -191,85 +280,111 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
_bias.allocator()->allocate();
// Get the gate tensors
- if(batch_size > 1)
+ if (batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0},
+ {output_size, batch_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0},
+ {2 * output_size, batch_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input,
+ {2 * output_size, 0}, {3 * output_size, batch_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0},
+ {4 * output_size, batch_size});
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size},
+ {2 * output_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+ {3 * output_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size},
+ {4 * output_size});
_output_lowp.allocator()->allocate();
}
// Forget gate
_memory_group.manage(&_forget_gate_output);
- _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_output.allocator()->init(
+ TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
- _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_output.allocator()->init(
+ TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
// Input modulation gate equation
_memory_group.manage(&_input_modulation_gate_output);
- _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_output.allocator()->init(
+ TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
- _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_output.allocator()->init(
+ TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state_tmp1);
- _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_tmp1.allocator()->init(
+ TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state_tmp2);
- _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state_tmp2.allocator()->init(
+ TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output,
+ &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
- _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+ _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out,
+ ConvertPolicy::SATURATE);
_cell_state_tmp1.allocator()->allocate();
_cell_state_tmp2.allocator()->allocate();
// Short term memory
_memory_group.manage(&_output_state_tmp);
- _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _output_state_tmp.allocator()->init(
+ TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
- _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_state_out_symm.allocator()->init(
+ TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output,
+ &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
- _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _output_state_out_f32.allocator()->init(
+ TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
_dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
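
The _dequantize/_quantize pair above requantizes the hidden state by going QSYMM16 -> F32 and then F32 -> QASYMM8. A hedged, element-wise view of that round trip, using illustrative scale/offset values rather than the library's actual quantization infos:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Sketch only: symmetric 16-bit value -> float -> asymmetric 8-bit value.
    uint8_t requantize(int16_t v_qsymm16, float sym_scale, float asym_scale, int32_t asym_offset)
    {
        const float   f = static_cast<float>(v_qsymm16) * sym_scale;                        // dequantize: symmetric, no offset
        const int32_t q = static_cast<int32_t>(std::lround(f / asym_scale)) + asym_offset;  // quantize: asymmetric
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));                         // saturate to the QASYMM8 range
    }
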
@@ -278,15 +393,28 @@ void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, co
}
Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
- output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
const int input_size = input->dimension(0);
@@ -299,29 +427,51 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
- TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
- TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
- TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+ TensorInfo input_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(input_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(output_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(
+ input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QASYMM8)
+ .set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QSYMM16)
+ .set_quantization_info(qsymm_4));
// Shape checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
// Data type checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
// Quantization checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
@@ -343,7 +493,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
- ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
// _concat_weights
std::vector<const ITensorInfo *> weights_vector;
@@ -353,7 +504,7 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
// _transpose_weights
const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
- TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
// _concat_inputs
@@ -379,7 +530,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
// _gemmlowp
const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
// Set the offset back
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -390,7 +542,8 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int output_multiplier = 0;
int output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// _output_stage
GEMMLowpOutputStageInfo info{};
@@ -405,68 +558,91 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
TensorInfo input_modulation_gate_input;
TensorInfo output_gate_input;
- if(batch_size > 1)
+ if (batch_size > 1)
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
}
else
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
}
// _sigmoid_forget_gate
const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _sigmoid_input_gate
const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _tanh_modulation_gate
- const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+ qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _sigmoid_output_gate
const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&output_gate_input, &output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _mul_forget_gate_cell_state
const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
// _mul_input_gate_input_mod_gate
const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+ &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _add_cell_state_tmps
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
// _tanh_modulation_gate
const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, &output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _mul_output_state_tmp_output_gate
const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+ &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _dequantize
const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -475,14 +651,14 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
// _quantize
ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
- if(cell_state_out->total_size() != 0)
+ if (cell_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
}
- if(output_state_out->total_size() != 0)
+ if (output_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -541,7 +717,7 @@ void CLLSTMLayerQuantized::run()
void CLLSTMLayerQuantized::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_input_weights.allocator()->allocate();
_concat_input_weights.run();
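
Taken together, the configure() graph reflowed above implements the usual quantized LSTM cell update. Writing W and R for the concatenated input and recurrent weight blocks, and x_t, h_{t-1}, c_{t-1} for the current input, previous output state and previous cell state, the per-step computation summarised from the calls above (not quoted from library documentation) is

    i_t = sigmoid(W_i x_t + R_i h_{t-1} + b_i)        // _sigmoid_input_gate
    f_t = sigmoid(W_f x_t + R_f h_{t-1} + b_f)        // _sigmoid_forget_gate
    g_t = tanh   (W_g x_t + R_g h_{t-1} + b_g)        // _tanh_modulation_gate
    o_t = sigmoid(W_o x_t + R_o h_{t-1} + b_o)        // _sigmoid_output_gate
    c_t = f_t * c_{t-1} + i_t * g_t                   // _mul_* products and _add_cell_state_tmps
    h_t = o_t * tanh(c_t)                             // _tanh_output_state, _mul_output_state_tmp_output_gate

with the matrix products done once on the concatenated weights via GEMMLowp, c_t kept in QSYMM16, and h_t requantized back to QASYMM8 through an F32 intermediate.
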
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index 696191c485..ea21c54bc3 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
@@ -33,7 +34,10 @@ namespace arm_compute
{
namespace experimental
{
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@ void CLLogicalAnd::run(ITensorPack &tensors)
struct CLLogicalAnd::Impl
{
- const ICLTensor *src0{ nullptr };
- const ICLTensor *src1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr };
+ const ICLTensor *src0{nullptr};
+ const ICLTensor *src1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLLogicalAnd> op{nullptr};
};
-CLLogicalAnd::CLLogicalAnd()
- : _impl(std::make_unique<Impl>())
+CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
+CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default;
CLLogicalAnd::~CLLogicalAnd() = default;
@@ -73,7 +76,10 @@ void CLLogicalAnd::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *ou
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
_impl->src0 = input1;
_impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
index a0504d7852..71f9cce54f 100644
--- a/src/runtime/CL/functions/CLLogicalNot.cpp
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClLogicalNot.h"
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct CLLogicalNot::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClLogicalNot> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClLogicalNot> op{nullptr};
};
-CLLogicalNot::CLLogicalNot()
- : _impl(std::make_unique<Impl>())
+CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
+CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default;
CLLogicalNot::~CLLogicalNot() = default;
@@ -72,4 +72,4 @@ void CLLogicalNot::run()
_impl->op->run(pack);
}
-} // namespace arm_compute \ No newline at end of file
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index f9a606e8a5..3db4fdae84 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
#include <utility>
@@ -33,7 +34,10 @@ namespace arm_compute
{
namespace experimental
{
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input1,
+ ITensorInfo *input2,
+ ITensorInfo *output)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@ void CLLogicalOr::run(ITensorPack &tensors)
struct CLLogicalOr::Impl
{
- const ICLTensor *src0{ nullptr };
- const ICLTensor *src1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLLogicalOr> op{ nullptr };
+ const ICLTensor *src0{nullptr};
+ const ICLTensor *src1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLLogicalOr> op{nullptr};
};
-CLLogicalOr::CLLogicalOr()
- : _impl(std::make_unique<Impl>())
+CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>())
{
}
-CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
+CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default;
CLLogicalOr::~CLLogicalOr() = default;
@@ -73,7 +76,10 @@ void CLLogicalOr::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *out
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
}
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output)
{
_impl->src0 = input1;
_impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index bef422fca1..e8bdad706b 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTypes.h"
+
#include "src/gpu/cl/operators/ClMatMul.h"
namespace arm_compute
@@ -32,23 +34,32 @@ using OperatorType = opencl::ClMatMul;
struct CLMatMul::Impl
{
- std::unique_ptr<OperatorType> op{ nullptr };
+ std::unique_ptr<OperatorType> op{nullptr};
ITensorPack run_pack{};
};
-CLMatMul::CLMatMul()
- : _impl(std::make_unique<Impl>())
+CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>())
{
}
CLMatMul::~CLMatMul() = default;
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CLMatMul::configure(ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *output,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(settings);
configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
}
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+void CLMatMul::configure(const CLCompileContext &compile_context,
+ ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *output,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
@@ -56,10 +67,14 @@ void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs
_impl->op = std::make_unique<OperatorType>();
_impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
- _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
+ _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
}
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+Status CLMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
{
return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
}
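
Since CLMatMul::configure() above now spells out its full parameter list, a minimal usage sketch may help. The shapes, data type and 2D layout below are assumptions for illustration (arm_compute tensor shapes are written innermost dimension first), and act_info is passed explicitly rather than relying on a default argument; treat this as a sketch, not a verified example.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/CLTypes.h"
    #include "arm_compute/runtime/CL/functions/CLMatMul.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create a default OpenCL context/queue

        // Assumed 2D shapes: lhs is M x K, rhs is K x N, dst is M x N.
        CLTensor lhs, rhs, dst;
        lhs.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32)); // K=64, M=32
        rhs.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32)); // N=16, K=64
        dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // N=16, M=32

        CLMatMul          mm;
        MatMulInfo        info;     // default: neither operand transposed
        GpuMatMulSettings settings; // default kernel selection
        mm.configure(&lhs, &rhs, &dst, info, settings, ActivationLayerInfo());

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill lhs/rhs, e.g. via map()/unmap() ...

        mm.run();
        CLScheduler::get().sync();
        return 0;
    }
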
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
index 2786d32d33..7494f379b9 100644
--- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -27,26 +27,32 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
namespace arm_compute
{
CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
- : _fill(),
- _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
+ : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
{
}
CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default;
-void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
}
-void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
const PixelValue zero_value(0.f);
@@ -55,7 +61,10 @@ void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICL
_unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info);
}
-Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
}
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index a81cbca1b0..5892c0e840 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
namespace arm_compute
{
@@ -35,7 +35,10 @@ void CLMeanStdDevNormalizationLayer::configure(ICLTensor *input, ICLTensor *outp
configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
}
-void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float epsilon)
{
ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
auto k = std::make_unique<CLMeanStdDevNormalizationKernel>();
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index c0cc5184e6..f93f82f1a2 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -30,10 +30,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
namespace arm_compute
{
@@ -50,7 +50,10 @@ void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const
configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
}
-void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+void CLNormalizationLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_ERROR_ON(input == nullptr);
ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
@@ -58,21 +61,24 @@ void CLNormalizationLayer::configure(const CLCompileContext &compile_context, IC
// Configure normalization kernel
_norm_kernel->configure(compile_context, input, output, norm_info);
- if(!_norm_kernel->border_size().empty())
+ if (!_norm_kernel->border_size().empty())
{
// Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
- _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+ _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue());
}
}
-Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status CLNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
return CLNormalizationLayerKernel::validate(input, output, norm_info);
}
void CLNormalizationLayer::run()
{
- if(!_norm_kernel->border_size().empty())
+ if (!_norm_kernel->border_size().empty())
{
// Run border handler
CLScheduler::get().enqueue(*_border_handler, false);
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index 63c9164a94..939c95bd45 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -24,20 +24,26 @@
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
-#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
#include <utility>
namespace arm_compute
{
-void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
}
-void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std)
{
ARM_COMPUTE_LOG_PARAMS(input, output, mean, std);
auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>();
@@ -45,8 +51,10 @@ void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_contex
_kernel = std::move(k);
}
-Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *std)
{
return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
}
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index 186e7b4ba2..ce6d285ebe 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/gpu/cl/IClKernel.h"
#include "src/gpu/cl/operators/ClPRelu.h"
@@ -33,17 +35,16 @@ using OperatorType = opencl::ClPRelu;
struct CLPReluLayer::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
};
-CLPReluLayer::CLPReluLayer()
- : _impl(std::make_unique<Impl>())
+CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>())
{
}
-CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
CLPReluLayer::~CLPReluLayer() = default;
@@ -52,13 +53,17 @@ void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *outp
configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
}
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *alpha,
+ ICLTensor *output)
{
_impl->src_0 = input;
_impl->src_1 = alpha;
_impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info()));
+ _impl->op->configure(compile_context, input->info(), alpha->info(),
+ (output == nullptr ? input->info() : output->info()));
}
Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 0ed8f03d64..e788ded512 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -22,37 +22,38 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
-#include "src/core/CL/kernels/CLPadLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
namespace arm_compute
{
-CLPadLayer::CLPadLayer()
- : _pad_kernel(std::make_unique<CLPadLayerKernel>()),
- _copy(),
- _perform_pad(false)
+CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false)
{
}
CLPadLayer::~CLPadLayer() = default;
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(
+ ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
}
-void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
- _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
- {
- return info.first > 0 || info.second > 0;
- });
+ _perform_pad =
+ std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
- if(_perform_pad)
+ if (_perform_pad)
{
_pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
}
@@ -62,14 +63,16 @@ void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *i
_copy.configure(compile_context, input, output);
}
}
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value,
+ PaddingMode mode)
{
- bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
- {
- return info.first > 0 || info.second > 0;
- });
+ bool perform_pad =
+ std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
- if(perform_pad)
+ if (perform_pad)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
}
@@ -81,7 +84,7 @@ Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
}
void CLPadLayer::run()
{
- if(_perform_pad)
+ if (_perform_pad)
{
CLScheduler::get().enqueue(*_pad_kernel);
}
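// [Editor's note: illustrative sketch, not part of the commit diff.]
// The CLPReluLayer and CLPadLayer hunks above also show three smaller rules
// applied throughout this section: braced initializers lose their inner
// padding ("{ nullptr }" -> "{nullptr}"), control-flow keywords gain a space
// before the parenthesis ("if(x)" -> "if (x)"), and short lambdas are collapsed
// onto one line. A compilable sketch with stand-in names (PadPair, ExampleImpl
// and needs_pad are hypothetical, not from this commit):
#include <algorithm>
#include <utility>
#include <vector>

using PadPair = std::pair<unsigned int, unsigned int>; // stand-in for PaddingInfo

struct ExampleImpl
{
    const int *src{nullptr}; // was: const int *src{ nullptr };
};

inline bool needs_pad(const std::vector<PadPair> &padding)
{
    // was a lambda spread over four lines, with the braces on their own lines
    const bool perform_pad =
        std::any_of(padding.begin(), padding.end(), [](PadPair info) { return info.first > 0 || info.second > 0; });
    if (perform_pad) // was: if(perform_pad)
    {
        return true;
    }
    return false;
}
// [End of editor's note.]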
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index a56afff7df..7f97eed98a 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -27,22 +27,21 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClPermute.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
namespace arm_compute
{
struct CLPermute::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClPermute> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClPermute> op{nullptr};
};
-CLPermute::CLPermute()
- : _impl(std::make_unique<Impl>())
+CLPermute::CLPermute() : _impl(std::make_unique<Impl>())
{
}
@@ -53,7 +52,10 @@ void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const Permu
configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
}
-void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+void CLPermute::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PermutationVector &perm)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, perm);
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 9d91e58367..6aa9d9cbb3 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClMul.h"
@@ -34,38 +35,55 @@ namespace arm_compute
{
struct CLPixelWiseMultiplication::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClMul> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClMul> op{nullptr};
};
-CLPixelWiseMultiplication::CLPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
-CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default;
CLPixelWiseMultiplication::~CLPixelWiseMultiplication() = default;
-void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+ configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy,
+ rounding_policy, act_info);
}
-void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClMul>();
- _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+ _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy,
+ rounding_policy, act_info);
}
-Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
@@ -82,26 +100,33 @@ void CLPixelWiseMultiplication::run()
struct CLComplexPixelWiseMultiplication::Impl
{
- const ICLTensor *src_0{ nullptr };
- const ICLTensor *src_1{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClComplexMul> op{ nullptr };
+ const ICLTensor *src_0{nullptr};
+ const ICLTensor *src_1{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClComplexMul> op{nullptr};
};
-CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default;
-
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLComplexPixelWiseMultiplication &
+CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication() = default;
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
}
-void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
@@ -110,7 +135,10 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile
_impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
}
-Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return opencl::ClComplexMul::validate(input1, input2, output, act_info);
}
diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp
index 11ae1d0fe6..ce1092a7cc 100644
--- a/src/runtime/CL/functions/CLPooling3dLayer.cpp
+++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClPool3d.h"
@@ -32,14 +33,13 @@ namespace arm_compute
{
struct CLPooling3dLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- ICLTensor *indices{ nullptr };
- std::unique_ptr<opencl::ClPool3d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ ICLTensor *indices{nullptr};
+ std::unique_ptr<opencl::ClPool3d> op{nullptr};
};
-CLPooling3dLayer::CLPooling3dLayer()
- : _impl(std::make_unique<Impl>())
+CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>())
{
}
CLPooling3dLayer::~CLPooling3dLayer() = default;
@@ -49,7 +49,10 @@ void CLPooling3dLayer::configure(const ICLTensor *input, ICLTensor *output, cons
configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info);
}
-void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info)
+void CLPooling3dLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Pooling3dLayerInfo &pool_info)
{
_impl->src = input;
_impl->dst = output;
@@ -58,7 +61,8 @@ void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const
_impl->op->configure(compile_context, input->info(), output->info(), pool_info);
}
-Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+Status
+CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
{
return opencl::ClPool3d::validate(input, output, pool_info);
}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 0ebce318fa..65e53b9be3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClPool2d.h"
@@ -32,34 +33,44 @@ namespace arm_compute
{
struct CLPoolingLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- ICLTensor *indices{ nullptr };
- std::unique_ptr<opencl::ClPool2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ ICLTensor *indices{nullptr};
+ std::unique_ptr<opencl::ClPool2d> op{nullptr};
};
-CLPoolingLayer::CLPoolingLayer()
- : _impl(std::make_unique<Impl>())
+CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>())
{
}
CLPoolingLayer::~CLPoolingLayer() = default;
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
}
-void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices)
{
_impl->src = input;
_impl->dst = output;
_impl->indices = indices;
_impl->op = std::make_unique<opencl::ClPool2d>();
- _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
+ _impl->op->configure(compile_context, input->info(), output->info(), pool_info,
+ (indices) ? indices->info() : nullptr);
}
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CLPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
return opencl::ClPool2d::validate(input, output, pool_info, indices);
}
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 019f0a7e61..cfd0ec4fbf 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -29,31 +29,40 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
using namespace arm_compute;
-CLPriorBoxLayer::CLPriorBoxLayer()
- : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
{
}
-void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info)
{
configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
}
-void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
- _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
- _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
- if(!info.max_sizes().empty())
+ _min = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.min_sizes().size() * sizeof(float));
+ _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.aspect_ratios().size() * sizeof(float));
+ if (!info.max_sizes().empty())
{
- _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+ _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ info.max_sizes().size() * sizeof(float));
}
auto k = std::make_unique<CLPriorBoxLayerKernel>();
@@ -61,7 +70,10 @@ void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const I
_kernel = std::move(k);
}
-Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayer::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
}
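// [Editor's note: illustrative sketch, not part of the commit diff.]
// A third recurring change, visible in the CLPReluLayer, CLPadLayer, CLPermute,
// CLPixelWiseMultiplication and CLPriorBoxLayer hunks above (and again in the
// CLQLSTMLayer hunk that follows), is include regrouping: the file's own header
// stays first, a blank line may be added after it, and the remaining includes
// are re-sorted alphabetically. Taking CLPadLayer.cpp from above as the example:
//
//   before:
//     #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
//     #include "src/core/CL/kernels/CLPadLayerKernel.h"
//     #include "src/common/utils/Log.h"
//   after:
//     #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
//     #include "src/common/utils/Log.h"
//     #include "src/core/CL/kernels/CLPadLayerKernel.h"
//
// The .clang-format options behind this pattern are not shown in this section,
// so the grouping rule described here is inferred from the hunks rather than
// quoted from the configuration.
// [End of editor's note.]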
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 7fbb866fa9..12f6f89290 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -26,29 +26,36 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
using namespace arm_compute::utils::info_helpers;
using namespace arm_compute::opencl::kernels;
namespace
{
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
- float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensorInfo *mm_input,
+ const ITensorInfo *mm_weights,
+ const ITensorInfo *bias,
+ float gemmlowp_scale,
+ const TensorInfo *mm_res_info,
+ const TensorInfo *outstage_tensor_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
return Status{};
}
} // namespace
@@ -78,14 +85,12 @@ void CLQLSTMLayer::TensorCopyKernel::run()
_src->map(q, true);
_dst->map(q, true);
- Iterator input_iter{ _src, _window };
- Iterator output_iter{ _dst, _window };
+ Iterator input_iter{_src, _window};
+ Iterator output_iter{_dst, _window};
- execute_window_loop(_window, [&](const Coordinates &)
- {
- memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
- },
- input_iter, output_iter);
+ execute_window_loop(
+ _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+ output_iter);
_src->unmap(q);
_dst->unmap(q);
@@ -104,7 +109,7 @@ CLQLSTMLayer::CLQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
_layer_norms(),
_copy_output()
{
- for(auto &norm : _layer_norms)
+ for (auto &norm : _layer_norms)
{
norm = std::make_unique<CLQLSTMLayerNormalizationKernel>();
}
@@ -129,17 +134,22 @@ Status CLQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf
{
// Output quantization scale will be different, but ignored here
// since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
+ const TensorInfo out{in};
return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}
-void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
- CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context,
+ CLGEMMLowpMatrixMultiplyCore &mm,
+ CLGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input,
+ const ICLTensor *mm_weights,
+ const ICLTensor *bias,
+ CLTensor *mm_res,
+ CLTensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info)
{
_memory_group.manage(mm_res);
_memory_group.manage(outstage_res);
@@ -151,30 +161,51 @@ void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMML
mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
// Configure output stage
- quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
mm_res->allocator()->allocate();
}
-void CLQLSTMLayer::configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
- cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params);
+ configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+ output_state_in, cell_state_out, output_state_out, output, lstm_params);
}
-void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
@@ -191,11 +222,11 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info));
const int batch_size = input->info()->dimension(1);
const int num_units = input_to_output_weights->info()->dimension(1);
@@ -216,7 +247,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
// Layer normalization
_has_layer_norm = lstm_params.use_layer_norm();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -238,53 +269,75 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
_has_cell_clipping = quantized_cell_clip > 0;
// Precompute effective bias for optimizing the matmul computations.
- if(!_has_cifg)
+ if (!_has_cifg)
{
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
- _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true));
+ _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(),
+ _input_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(
+ compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
- _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true));
- _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true));
- if(_has_projection)
+ _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(),
+ _input_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(
+ compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(
+ compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(),
+ _input_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(
+ compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if (_has_projection)
{
- _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
- if(_projection_bias != nullptr)
+ _projection_reduction->configure(
+ compile_context, _projection_weights->info(), _projection_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ if (_projection_bias != nullptr)
{
- _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+ _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias,
+ &_projection_eff_bias, ConvertPolicy::SATURATE);
}
}
// Pre-transpose weights to be used in GEMM.
- _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
- _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
- _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
- _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
- _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
- _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
- if(!_has_cifg)
+ _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights,
+ &_input_to_forget_weights_transposed);
+ _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights,
+ &_input_to_cell_weights_transposed);
+ _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights,
+ &_input_to_output_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights,
+ &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights,
+ &_recurrent_to_cell_weights_transposed);
+ _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights,
+ &_recurrent_to_output_weights_transposed);
+ if (!_has_cifg)
{
- _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
- _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(),
+ &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(),
+ &_recurrent_to_input_weights_transposed);
}
- if(_has_projection)
+ if (_has_projection)
{
_transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
}
@@ -297,42 +350,55 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
// Forget gate.
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
- const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
- input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
- &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+ &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+ &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
&_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
mm_out_info, forget_gate_outstage_info);
- _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
ConvertPolicy::SATURATE);
_input_to_forget_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_forget_res);
- _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_forget_outstage_res);
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ const float cell_to_forget_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr,
+ &_cell_to_forget_outstage_res, gemmlowp_info);
_mul_cell_to_forget_res.allocator()->allocate();
- _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res,
+ &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
ConvertPolicy::SATURATE);
_cell_to_forget_outstage_res.allocator()->allocate();
}
CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);
_recurrent_to_forget_outstage_res.allocator()->allocate();
@@ -345,30 +411,33 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_forget_gate);
_forget_gate.allocator()->init(forget_gate_info);
- _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
forget_activation_input->allocator()->allocate();
// Modulation gate.
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
- input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
- &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
- mm_out_info, cell_outstage_info);
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
- &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
- mm_out_info, cell_outstage_info);
-
- _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input,
+ &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res,
+ &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info);
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+ &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
+
+ _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res,
+ &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
ConvertPolicy::SATURATE);
_input_to_cell_outstage_res.allocator()->allocate();
CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
_recurrent_to_cell_outstage_res.allocator()->allocate();
@@ -378,14 +447,15 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_cell_gate);
_cell_gate.allocator()->init(cell_gate_info);
- _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
cell_activation_input->allocator()->allocate();
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_input_gate.allocator()->init(input_gate_info);
_memory_group.manage(&_input_gate);
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.allocator()->init(*_forget_gate.info());
_input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -393,107 +463,142 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
}
else
{
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
- input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
- &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
- mm_out_info, input_outstage_info);
-
- const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+ &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+ &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale =
+ _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
mm_out_info, input_outstage_info);
- _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
- ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_input_to_input_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
- _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _mul_cell_to_input_res.allocator()->init(
+ TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_input_res);
- _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+ &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_input_outstage_res);
- _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr,
+ &_cell_to_input_outstage_res, gemmlowp_info);
_mul_cell_to_input_res.allocator()->allocate();
- _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_cell_to_input_outstage_res.allocator()->allocate();
}
CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
_recurrent_to_input_outstage_res.allocator()->allocate();
input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
}
- _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
input_activation_input->allocator()->allocate();
}
// Cell.
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
- _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
- const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(mul_input_cell_scale, 0));
_memory_group.manage(&_mul_input_cell_res);
_mul_input_cell_res.allocator()->init(mul_input_cell_info);
- _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_gate.allocator()->allocate();
- _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+ _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out,
+ ConvertPolicy::SATURATE);
_mul_input_cell_res.allocator()->allocate();
_forget_gate.allocator()->allocate();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
- _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ _cell_clip.configure(compile_context, cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip));
}
// Output gate.
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
- input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
- &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
- mm_out_info, output_outstage_info);
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+ &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+ &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
&_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
mm_out_info, output_outstage_info);
- _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+ &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
ConvertPolicy::SATURATE);
_input_to_output_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
- _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
- const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(),
+ &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_output_outstage_res);
- _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr,
+ &_cell_to_output_outstage_res, gemmlowp_info);
_mul_cell_to_output_res.allocator()->allocate();
- _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+ _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+ &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
ConvertPolicy::SATURATE);
_cell_to_output_outstage_res.allocator()->allocate();
}
CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
_recurrent_to_output_outstage_res.allocator()->allocate();
@@ -503,20 +608,24 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_output_gate);
_output_gate.allocator()->init(output_gate_info);
- _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
output_activation_input->allocator()->allocate();
// Hidden.
- _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
// TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
- _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_output_gate.allocator()->allocate();
_input_gate.allocator()->allocate();
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = output_state_in->info()->data_type();
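For reference, the per-gate scales computed in this hunk (weight scale times activation scale divided by the gate's intermediate scale) are handed to quantization::calculate_quantized_multiplier, which expresses a real-valued rescaling factor as an integer multiplier plus shift. The standalone sketch below shows the standard form of that decomposition on made-up scale values; decompose_scale is an invented helper name and this is not the library's implementation.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a positive scale into multiplier and shift such that
// scale ~= multiplier * 2^-31 * 2^-shift (illustrative only).
static void decompose_scale(double scale, std::int32_t &multiplier, std::int32_t &shift)
{
    int          exponent = 0;
    const double q        = std::frexp(scale, &exponent); // scale = q * 2^exponent, q in [0.5, 1)
    std::int64_t q31      = std::llround(q * (1ll << 31));
    if (q31 == (1ll << 31)) // rounding pushed q up to 1.0: renormalise
    {
        q31 /= 2;
        ++exponent;
    }
    multiplier = static_cast<std::int32_t>(q31);
    shift      = -exponent; // a positive shift means divide by 2^shift
}

int main()
{
    // Hypothetical values in the spirit of the code above:
    // effective_scale = weight_scale * activation_scale / intermediate_scale.
    const double effective_scale = 0.004 * 0.05 / 0.007;

    std::int32_t multiplier = 0;
    std::int32_t shift      = 0;
    decompose_scale(effective_scale, multiplier, shift);
    std::printf("scale=%f multiplier=%ld shift=%d\n", effective_scale, static_cast<long>(multiplier),
                static_cast<int>(shift));
    return 0;
}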
@@ -525,7 +634,7 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_memory_group.manage(&_hidden_gate);
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->init(*output_state_out->info());
_hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -536,27 +645,26 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_hidden_mul_res.allocator()->allocate();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
const TensorInfo projection_outstage_info(*output_state_out->info());
- const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
- gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
- gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
- gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
- TensorInfo projection_mm_out_info{ mm_out_info };
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
- hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
- &_mm_projection_res, &_projection_outstage_res, projection_scale,
- projection_mm_out_info, projection_outstage_info);
+ configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+ &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+ &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
ICLTensor *accumulate_destination = output_state_out;
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->allocate();
_projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -565,31 +673,34 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
accumulate_destination = &_projection_accumulate_res;
}
- _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination,
+ accumulate_destination, ConvertPolicy::SATURATE);
_projection_outstage_res.allocator()->allocate();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
_projection_accumulate_res.allocator()->allocate();
}
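The projection clip handling just below quantizes the floating-point clip threshold into the int8 domain and later applies it through a bounded-ReLU activation. A minimal standalone sketch of that clamping with invented scale and clip values (not the library's code):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Invented values; in the code below the threshold comes from
    // lstm_params.projection_clip() and the scale from qprojection.scale.
    const float scale = 0.0125f;
    const float clip  = 1.0f;

    // Quantize the threshold, saturating into the int8 range [-128, 127].
    const long q     = std::lround(clip / scale);
    const int  qclip = static_cast<int>(std::min(127l, std::max(-128l, q)));

    // Applying the clip is then a clamp in the quantized domain,
    // y = clamp(x, -qclip, qclip), which is what the bounded-ReLU activation performs.
    std::vector<std::int8_t> values = {-100, -5, 0, 42, 90, 127};
    for (auto &v : values)
    {
        v = static_cast<std::int8_t>(std::max(-qclip, std::min(qclip, static_cast<int>(v))));
    }
    for (auto v : values)
    {
        std::printf("%d ", static_cast<int>(v));
    }
    std::printf("\n");
    return 0;
}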
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
- quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ quantized_projection_clip =
+ utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip));
+ _projection_clip.configure(compile_context, output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip));
_has_projection_clipping = true;
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
_hidden_gate.allocator()->allocate();
@@ -600,17 +711,27 @@ void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLT
_copy_output.configure(compile_context, output_state_out, output);
}
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
- cell_state_out, output_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -622,13 +743,16 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+ input_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -647,20 +771,25 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
// Check whether peephole weights are all there or none
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
}
}
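The peephole checks above require the cell-to-gate weights to be one-dimensional with num_units elements and a QSYMM16 data type. What they describe is a per-unit (diagonal) connection: the cell state is multiplied element-wise by those weights and the result is added to the gate pre-activation, which is why configure() uses a pixel-wise multiplication followed by a saturating addition rather than a matrix multiply. A float-domain sketch on made-up values:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    // Made-up values; num_units == 3 here.
    const std::vector<float> cell_state           = {0.1f, -0.4f, 0.7f};
    const std::vector<float> cell_to_gate_weights = {0.5f, 0.25f, -0.1f};
    std::vector<float>       gate_preactivation   = {0.2f, 0.0f, -0.3f};

    for (std::size_t i = 0; i < gate_preactivation.size(); ++i)
    {
        gate_preactivation[i] += cell_state[i] * cell_to_gate_weights[i]; // element-wise peephole term
    }
    for (float v : gate_preactivation)
    {
        std::printf("%f ", v);
    }
    std::printf("\n");
    return 0;
}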
@@ -674,7 +803,7 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
@@ -682,33 +811,50 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Precompute effective bias for optimizing the matmul computations.
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.input_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_forget_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_cell_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_output_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
- if(lstm_params.projection_bias() != nullptr)
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.projection_weights(), &projection_eff_bias_info,
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+ if (lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
- &projection_eff_bias_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+ &projection_eff_bias_info, ConvertPolicy::SATURATE));
}
}
- const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
- const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+ input_to_forget_weights->data_type(),
+ input_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
// Validate weights transpose
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -717,15 +863,20 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
}
GEMMLowpOutputStageInfo gemmlowp_info;
@@ -738,28 +889,42 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
// Forget gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
- const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
- const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_forget_scale, &mm_out_info, &forget_outstage_info));
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+ &forget_outstage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
const ITensorInfo *b_info = forget_gate_bias;
@@ -770,20 +935,29 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Modulation gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
- const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
- if(has_layer_norm)
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+ &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+ &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
const ITensorInfo *b_info = cell_bias;
@@ -791,85 +965,123 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+ "Input gate bias must not be present when CIFG is used");
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+ &forget_gate_info, ConvertPolicy::SATURATE));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+ lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
- const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
- if(lstm_params.has_peephole_opt())
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale =
+ lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+ &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
const ITensorInfo *b_info = lstm_params.input_gate_bias();
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ &input_outstage_info, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
- if(quantized_cell_clip > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if (quantized_cell_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
- quantized_cell_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip)));
}
// Output gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+ &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+ DataType::QSYMM16);
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
const ITensorInfo *b_info = output_gate_bias;
@@ -877,85 +1089,103 @@ Status CLQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLActivationLayer::validate(cell_state_out, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
- ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+ &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = hidden_out_info.data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
const bool projection_tensor_copy_required = num_units != output_size;
// Projection.
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+ lstm_params.projection_weights());
ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
- const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
const TensorInfo projection_outstage_info(*output_state_out);
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
- TensorInfo projection_mm_out_info{ mm_out_info };
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+ &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
&projection_outstage_info));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+ ConvertPolicy::SATURATE));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+ output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip)));
}
}
else
{
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
}
}
- if(cell_state_out->total_size() > 0)
+ if (cell_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
}
- if(output_state_out->total_size() > 0)
+ if (output_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -980,14 +1210,14 @@ void CLQLSTMLayer::run()
_recurrent_to_forget_outstage.run();
_accumulate_input_recurrent_forget.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
}
@@ -1002,7 +1232,7 @@ void CLQLSTMLayer::run()
_recurrent_to_cell_outstage.run();
_accumulate_input_recurrent_modulation.run();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
}
@@ -1010,7 +1240,7 @@ void CLQLSTMLayer::run()
_cell_gate_tanh.run();
// Input gate
- if(_has_cifg)
+ if (_has_cifg)
{
_input_gate_sub.run();
}
@@ -1022,14 +1252,14 @@ void CLQLSTMLayer::run()
_recurrent_to_input_outstage.run();
_accumulate_input_recurrent_input.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
}
@@ -1041,7 +1271,7 @@ void CLQLSTMLayer::run()
_pixelwise_mul_forget_cell.run();
_pixelwise_mul_input_cell.run();
_add_forget_cell.run();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
_cell_clip.run();
}
@@ -1052,14 +1282,14 @@ void CLQLSTMLayer::run()
_mm_recurrent_to_output.run();
_recurrent_to_output_outstage.run();
_accumulate_input_recurrent_output.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
}
@@ -1072,31 +1302,31 @@ void CLQLSTMLayer::run()
_hidden_outstage.run();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
_mm_projection.run();
_projection_outstage.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_output_to_accumulate_copy.run();
}
_accumulate_projection.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.run();
}
- if(_has_projection_clipping)
+ if (_has_projection_clipping)
{
_projection_clip.run();
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.run();
}
@@ -1108,7 +1338,7 @@ void CLQLSTMLayer::run()
void CLQLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Pre-transpose weights to be used in GEMM.
_input_to_forget_weights_transposed.allocator()->allocate();
@@ -1125,10 +1355,11 @@ void CLQLSTMLayer::prepare()
_transpose_recurrent_to_output_weights.run();
// Precompute effective biases
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.map(true);
- std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 32767);
_ones.unmap();
}
else
@@ -1136,10 +1367,12 @@ void CLQLSTMLayer::prepare()
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } };
+ ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights},
+ {ACL_DST, &_input_to_input_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
- ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } };
+ ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights},
+ {ACL_DST, &_recurrent_to_input_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
_input_to_input_weights_transposed.allocator()->allocate();
@@ -1156,30 +1389,35 @@ void CLQLSTMLayer::prepare()
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } };
+ ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights},
+ {ACL_DST, &_input_to_forget_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
- ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } };
+ ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights},
+ {ACL_DST, &_recurrent_to_forget_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
- ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } };
+ ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
- ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } };
+ ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights},
+ {ACL_DST, &_recurrent_to_cell_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
- ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } };
+ ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights},
+ {ACL_DST, &_input_to_output_eff_bias}};
CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
- ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } };
+ ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights},
+ {ACL_DST, &_recurrent_to_output_eff_bias}};
CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
- if(_has_projection)
+ if (_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } };
+ ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}};
CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
- if(_projection_bias != nullptr)
+ if (_projection_bias != nullptr)
{
_projection_bias_add.run();
_projection_bias->mark_as_unused();
@@ -1189,7 +1427,7 @@ void CLQLSTMLayer::prepare()
_transpose_projection_weights.run();
_projection_weights->mark_as_unused();
- if(!_projection_tensor_copy_required)
+ if (!_projection_tensor_copy_required)
{
_hidden_gate.mark_as_unused();
_projection_accumulate_res.mark_as_unused();
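The prepare() hunks above precompute per-gate effective biases by reducing each weight matrix with the negated input (or output-state) zero point before any GEMM runs. The identity being exploited is that, for an asymmetric input with zero point z, sum_k (x[k] - z) * w[i][k] = sum_k x[k] * w[i][k] - z * sum_k w[i][k], so the second term can be folded into the bias once. A toy CPU sketch of that folding with invented shapes and values (the library does it on the GPU with a GEMMLowp matrix-A reduction kernel):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const int                                   num_units        = 2;
    const int                                   input_size       = 3;
    const std::int32_t                          input_zero_point = 5;
    const std::vector<std::vector<std::int8_t>> weights          = {{1, -2, 3}, {4, 0, -1}};
    const std::vector<std::int32_t>             bias             = {100, -50};

    std::vector<std::int32_t> eff_bias = bias;
    for (int i = 0; i < num_units; ++i)
    {
        std::int32_t row_sum = 0;
        for (int k = 0; k < input_size; ++k)
        {
            row_sum += weights[i][k];
        }
        eff_bias[i] -= input_zero_point * row_sum; // fold -z * sum_k w[i][k] into the bias
    }

    for (std::int32_t b : eff_bias)
    {
        std::printf("%d ", static_cast<int>(b));
    }
    std::printf("\n");
    return 0;
}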
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index b249bdd1db..6edef29992 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClQuantize.h"
@@ -32,13 +33,12 @@ namespace arm_compute
{
struct CLQuantizationLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClQuantize> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClQuantize> op{nullptr};
};
-CLQuantizationLayer::CLQuantizationLayer()
- : _impl(std::make_unique<Impl>())
+CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>())
{
}
CLQuantizationLayer::~CLQuantizationLayer() = default;
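The Impl struct, the brace-initialised members and the out-of-line '= default' destructor reformatted above are the usual pimpl arrangement: because the public class only holds a std::unique_ptr to a forward-declared Impl, its destructor has to be defined in the source file, after Impl is complete. A self-contained sketch of the same idiom, with invented names:

#include <cstdio>
#include <memory>

// Header-side view: only a forward declaration of Impl is visible, so the
// destructor cannot be generated here (unique_ptr<Impl> needs the complete
// type in order to delete it).
class Wrapper
{
public:
    Wrapper();
    ~Wrapper(); // declared here, defined after Impl below
    void run() const;

private:
    struct Impl; // forward declaration only
    std::unique_ptr<Impl> _impl;
};

// "Source-file" side: Impl is now a complete type.
struct Wrapper::Impl
{
    int value{42};
};

Wrapper::Wrapper() : _impl(std::make_unique<Impl>())
{
}

Wrapper::~Wrapper() = default; // must come after the definition of Impl

void Wrapper::run() const
{
    std::printf("%d\n", _impl->value);
}

int main()
{
    Wrapper w;
    w.run();
    return 0;
}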
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 6f122866b2..34b78eefa7 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,24 +28,37 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(),
+ : _memory_group(std::move(memory_manager)),
+ _gemm_state_f(),
+ _add_kernel(),
+ _activation(),
+ _fully_connected_kernel(),
+ _copy(),
+ _fully_connected_out(),
+ _gemm_output(),
+ _add_output(),
_is_prepared(false)
{
}
CLRNNLayer::~CLRNNLayer() = default;
-Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
- const ITensorInfo *output, const ActivationLayerInfo &info)
+Status CLRNNLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -63,28 +76,42 @@ Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
- auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+ auto shape_info =
+ TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
return Status{};
}
-void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+void CLRNNLayer::configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
ActivationLayerInfo &info)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state,
+ output, info);
}
-void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
- ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info)
+void CLRNNLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+ bias->info(), hidden_state->info(), output->info(), info));
ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -133,7 +160,7 @@ void CLRNNLayer::run()
void CLRNNLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fully_connected_kernel.prepare();
_gemm_state_f.prepare();
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 867ef7c7ac..1939d1d0ba 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -24,26 +24,36 @@
#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
#include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
namespace arm_compute
{
-Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
return Status{};
}
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 239a1c6bb2..0d2eab0c76 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -22,24 +22,35 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
#include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
using namespace arm_compute;
-Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info);
}
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
}
-void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index 3fbbd5f952..5c3f7f9c8c 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -27,9 +27,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLRangeKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
using namespace arm_compute;
@@ -38,7 +38,8 @@ void CLRange::configure(ICLTensor *output, const float start, const float end, c
configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
}
-void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRange::configure(
+ const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
{
ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
auto k = std::make_unique<CLRangeKernel>();
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index cddbf77d7c..6c6daff5ba 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -27,23 +27,25 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/CL/kernels/CLFillBorderKernel.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
namespace
{
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
@@ -51,29 +53,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
//axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
// Only validate if not using auto_init for the output tensor
TensorShape out_shape = input->tensor_shape();
// Validate output_shape only if not using auto_init
convert_negative_axis(axis_local, input_dims);
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ for (unsigned int i = 0; i < reduction_ops; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ if (output->total_size() > 0 && keep_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
}
- if(keep_dims)
+ if (keep_dims)
{
out_shape.set(axis_local[i], 1);
}
@@ -87,8 +89,9 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
- if(requant)
+ const bool requant =
+ is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+ if (requant)
{
TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
CLDequantizationLayer::validate(input, &input_no_quant);
@@ -98,10 +101,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
}
return Status{};
}
-}
+} // namespace
CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+ : _memory_group(std::move(memory_manager)),
+ _reduction_kernels(),
+ _reduced_outs(),
+ _reshape(),
+ _dequant(),
+ _requant(),
+ _reduction_ops(),
+ _keep_dims(),
+ _do_requant(),
+ _input_no_quant(),
_output_no_quant()
{
}
@@ -111,17 +123,23 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
}
-void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+void CLReduceMean::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ ICLTensor *output)
{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
+ _do_requant = is_data_type_quantized(input->info()->data_type()) &&
+ input->info()->quantization_info() != output->info()->quantization_info();
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -129,7 +147,7 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
ICLTensor *tmp_input = input;
ICLTensor *tmp_output = output;
- if(_do_requant)
+ if (_do_requant)
{
_memory_group.manage(&_input_no_quant);
_memory_group.manage(&_output_no_quant);
@@ -148,46 +166,51 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ TensorShape out_shape =
+ i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
- if(i == _reduction_ops - 1 && keep_dims)
+ if (i == _reduction_ops - 1 && keep_dims)
{
- _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i],
+ ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(),
+ tmp_input->info()->data_type(),
+ tmp_input->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
- _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
+ _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i],
+ ReductionOperation::MEAN_SUM);
}
}
// Allocate intermediate tensors
- for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
// Configure reshape layer if we want to drop the dimensions
- if(!_keep_dims)
+ if (!_keep_dims)
{
TensorShape out_shape = tmp_input->info()->tensor_shape();
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i, false);
}
auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output);
}
- if(_do_requant)
+ if (_do_requant)
{
_requant.configure(compile_context, &_output_no_quant, output);
_input_no_quant.allocator()->allocate();
@@ -195,7 +218,10 @@ void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor
}
}
-Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status CLReduceMean::validate(const ITensorInfo *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ const ITensorInfo *output)
{
return validate_config(input, reduction_axis, keep_dims, output);
}
@@ -204,19 +230,19 @@ void CLReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_requant)
+ if (_do_requant)
{
_dequant.run();
}
- for(auto &kernel : _reduction_kernels)
+ for (auto &kernel : _reduction_kernels)
{
kernel.run();
}
- if(!_keep_dims)
+ if (!_keep_dims)
{
_reshape.run();
}
- if(_do_requant)
+ if (_do_requant)
{
_requant.run();
}
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index cdc7fec51b..ba5489018e 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -27,35 +27,43 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CL/kernels/CLReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/runtime/Utils.h"
-#include "src/common/utils/Log.h"
-
namespace arm_compute
{
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false)
+ : _memory_group(std::move(memory_manager)),
+ _unreshaped_output(),
+ _reduction_kernel(),
+ _reshape(),
+ _reduction_axis(),
+ _is_reshape_required(false)
{
}
CLReductionOperation::~CLReductionOperation() = default;
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status CLReductionOperation::validate(
+ const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const bool is_reshape_required = !keep_dims;
- if(is_reshape_required && output->total_size() != 0)
+ if (is_reshape_required && output->total_size() != 0)
{
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
}
@@ -67,22 +75,23 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
const auto input_qinfo = input->quantization_info();
const auto output_data_type = output->data_type();
- auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
- {
+ auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+ QuantizationInfo qinfo) {
ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
};
- if(is_reshape_required)
+ if (is_reshape_required)
{
auto shape_before_reshape = input_shape;
shape_before_reshape.set(axis, 1);
- initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
+ initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles,
+ input_qinfo);
output_internal = &output_before_reshape;
}
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
- if(is_reshape_required)
+ if (is_reshape_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output));
}
@@ -92,7 +101,7 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
{
- if(!_is_reshape_required)
+ if (!_is_reshape_required)
{
return output;
}
@@ -103,12 +112,18 @@ ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor
return &_unreshaped_output;
}
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(
+ ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
}
-void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
@@ -117,11 +132,17 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
auto *output_internal = configure_intermediate_result_vector(input, output);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- const auto output_data_type = input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = input->info()->data_type();
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
_memory_group.manage(&_unreshaped_output);
}
@@ -129,7 +150,7 @@ void CLReductionOperation::configure(const CLCompileContext &compile_context, IC
_reduction_kernel = std::make_unique<CLReductionOperationKernel>();
_reduction_kernel->configure(compile_context, input, output_internal, axis, op);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.configure(compile_context, &_unreshaped_output, output);
_unreshaped_output.allocator()->allocate();
@@ -142,7 +163,7 @@ void CLReductionOperation::run()
CLScheduler::get().enqueue(*_reduction_kernel, false);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.run();
}
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index 15de959225..156e9b90c1 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -27,9 +27,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
#include <utility>
@@ -40,7 +40,10 @@ void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride
configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
}
-void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ int32_t stride)
{
ARM_COMPUTE_LOG_PARAMS(input, output, stride);
auto k = std::make_unique<CLReorgLayerKernel>();
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index c51a3298c1..3d6349fb25 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClReshape.h"
@@ -35,17 +36,16 @@ namespace arm_compute
{
struct CLReshapeLayer::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClReshape> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClReshape> op{nullptr};
};
-CLReshapeLayer::CLReshapeLayer()
- : _impl(std::make_unique<Impl>())
+CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>())
{
}
-CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
+CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default;
CLReshapeLayer::~CLReshapeLayer() = default;
@@ -78,4 +78,4 @@ void CLReshapeLayer::run()
_impl->op->run(pack);
}
} // namespace arm_compute
-/** [CLReshapeLayer snippet] **/
\ No newline at end of file
+ /** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 1fc93571d9..415de52e64 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -24,9 +24,9 @@
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLReverseKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
namespace arm_compute
{
@@ -35,7 +35,10 @@ void CLReverse::configure(const ICLTensor *input, ICLTensor *output, const ICLTe
configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
}
-void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverse::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis)
{
ARM_COMPUTE_LOG_PARAMS(input, output, axis);
auto k = std::make_unique<CLReverseKernel>();
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 5b78989bfa..abff0724e4 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClScale.h"
@@ -33,13 +34,12 @@ namespace arm_compute
{
struct CLScale::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClScale> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClScale> op{nullptr};
};
-CLScale::CLScale()
- : _impl(std::make_unique<Impl>())
+CLScale::CLScale() : _impl(std::make_unique<Impl>())
{
}
CLScale::~CLScale() = default;
@@ -49,7 +49,10 @@ void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelIn
configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
}
-void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
+void CLScale::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ScaleKernelInfo &info)
{
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index c4ab3dc67a..b4897d9e62 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -25,9 +25,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSelectKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
using namespace arm_compute;
@@ -38,7 +38,11 @@ void CLSelect::configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor
configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
}
-void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelect::configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
auto k = std::make_unique<CLSelectKernel>();
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index 7e3ac7d769..f79c6a1235 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -26,15 +26,19 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
@@ -47,15 +51,16 @@ void CLSlice::configure(const CLCompileContext &compile_context, const ITensorIn
_kernel = std::move(k);
}
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
// Check start dimensions for being non-negative
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
- {
- return i < 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -66,20 +71,22 @@ Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
struct CLSlice::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<experimental::CLSlice> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<experimental::CLSlice> op{nullptr};
};
-CLSlice::CLSlice()
- : _impl(std::make_unique<Impl>())
+CLSlice::CLSlice() : _impl(std::make_unique<Impl>())
{
}
-CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice::CLSlice(CLSlice &&) = default;
CLSlice &CLSlice::operator=(CLSlice &&) = default;
CLSlice::~CLSlice() = default;
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
return experimental::CLSlice::validate(input, output, starts, ends);
}
@@ -89,7 +96,11 @@ void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordin
configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
}
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
_impl->src = input;
_impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d52352fc8d..2e70e2aa08 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -22,12 +22,14 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
#include "src/gpu/cl/operators/ClPermute.h"
@@ -40,9 +42,9 @@ using OperatorType = opencl::ClSoftmax;
template <bool IS_LOG>
struct CLSoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<CLTensor> workspace_tensors{};
@@ -65,28 +67,30 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(
+ const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
{
_impl->src = input;
_impl->dst = output;
_impl->op = std::make_unique<OperatorType>();
- SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis };
+ SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis};
_impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
template <bool IS_LOG>
-Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
- SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis };
+ SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis};
return OperatorType::validate(*input, *output, softmax_info);
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::run()
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
{
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 3b7083400b..37f728895f 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -29,71 +29,100 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
namespace arm_compute
{
CLSpaceToBatchLayer::CLSpaceToBatchLayer()
- : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()),
- _fill(),
- _has_padding(false)
+ : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false)
{
}
CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default;
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _fill.configure(compile_context, output,
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
_space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output);
}
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+ padding_right, output);
}
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
- _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+ _fill.configure(compile_context, output,
+ PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
}
- _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+ _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+ output);
}
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
}
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -101,7 +130,7 @@ Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
void CLSpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- if(_has_padding)
+ if (_has_padding)
{
//CLScheduler::get().enqueue(*_fill, true);
_fill.run();
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index 67dafff47f..22695c9ef3 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -29,14 +29,13 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
namespace arm_compute
{
-CLSpaceToDepthLayer::CLSpaceToDepthLayer()
- : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
+CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
{
}
@@ -47,7 +46,10 @@ void CLSpaceToDepthLayer::configure(const ICLTensor *input, ICLTensor *output, i
configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
}
-void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ int32_t block_shape)
{
ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
_space_to_depth_kernel->configure(compile_context, input, output, block_shape);
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index 0b27371e3f..6be43cc5cd 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
@@ -38,7 +39,7 @@ void CLSplit::run()
{
cl::CommandQueue q = CLScheduler::get().queue();
- for(unsigned i = 0; i < _num_outputs; ++i)
+ for (unsigned i = 0; i < _num_outputs; ++i)
{
_slice_functions[i].run();
}
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 6a335da00c..c15496fc31 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -21,8 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <complex>
-
#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -32,16 +30,16 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLStackLayerKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
+
+#include <complex>
namespace arm_compute
{
CLStackLayer::CLStackLayer() // NOLINT
- : _input(),
- _stack_kernels(),
- _num_inputs(0)
+ : _input(), _stack_kernels(), _num_inputs(0)
{
}
@@ -52,7 +50,10 @@ void CLStackLayer::configure(const std::vector<ICLTensor *> &input, int axis, IC
configure(CLKernelLibrary::get().get_compile_context(), input, axis, output);
}
-void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+void CLStackLayer::configure(const CLCompileContext &compile_context,
+ const std::vector<ICLTensor *> &input,
+ int axis,
+ ICLTensor *output)
{
ARM_COMPUTE_LOG_PARAMS(input, axis, output);
_num_inputs = input.size();
@@ -61,7 +62,7 @@ void CLStackLayer::configure(const CLCompileContext &compile_context, const std:
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for (unsigned int i = 0; i < _num_inputs; i++)
{
_stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>());
_stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output);
@@ -79,7 +80,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
const unsigned int num_inputs = input.size();
- for(unsigned int i = 0; i < num_inputs; i++)
+ for (unsigned int i = 0; i < num_inputs; i++)
{
// All the tensors must have the same rank
ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -92,7 +93,7 @@ Status CLStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
void CLStackLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ for (unsigned i = 0; i < _num_inputs; i++)
{
CLScheduler::get().enqueue(*_stack_kernels[i], false);
}
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index 261bdc13d1..c1953cc415 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -25,17 +25,23 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto k = std::make_unique<CLStridedSliceKernel>();
@@ -43,9 +49,14 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IT
_kernel = std::move(k);
}
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
}
@@ -53,32 +64,43 @@ Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out
struct CLStridedSlice::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- CLRuntimeContext *ctx{ nullptr };
- std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ CLRuntimeContext *ctx{nullptr};
+ std::unique_ptr<experimental::CLStridedSlice> op{nullptr};
};
-CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
CLStridedSlice::~CLStridedSlice() = default;
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -86,14 +108,21 @@ void CLStridedSlice::configure(const CLCompileContext &compile_context, const IC
_impl->dst = output;
_impl->op = std::make_unique<experimental::CLStridedSlice>();
- _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask,
+ end_mask, shrink_axis_mask);
}
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
void CLStridedSlice::run()
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index ef790995f9..4f86c4adfa 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/CL/functions/CLTile.h"
-#include "src/core/CL/kernels/CLTileKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
namespace arm_compute
{
@@ -34,7 +33,10 @@ void CLTile::configure(const ICLTensor *input, ICLTensor *output, const Multiple
configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
}
-void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTile::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples)
{
ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
auto k = std::make_unique<CLTileKernel>();
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index e63c92eeb4..5a738f47ce 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/gpu/cl/operators/ClTranspose.h"
@@ -34,12 +35,11 @@ namespace arm_compute
{
struct CLTranspose::Impl
{
- const ICLTensor *src{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClTranspose> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClTranspose> op{nullptr};
};
-CLTranspose::CLTranspose()
- : _impl(std::make_unique<Impl>())
+CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>())
{
}
CLTranspose::~CLTranspose() = default;
@@ -70,4 +70,4 @@ void CLTranspose::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 98d47810ab..ddd83e7824 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -40,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
}
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start,
+ int32_t &slice_end_mask,
+ const unsigned int input_num_dimensions)
{
// Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
Coordinates slice_end;
slice_start.set_num_dimensions(input_num_dimensions);
slice_end.set_num_dimensions(input_num_dimensions);
- for(size_t k = 0; k < input_num_dimensions; ++k)
+ for (size_t k = 0; k < input_num_dimensions; ++k)
{
slice_start.set(k, 0);
slice_end.set(k, -1);
@@ -56,8 +58,7 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &
} // namespace
CLUnstack::CLUnstack() // NOLINT
- : _num_slices(0),
- _strided_slice_vector()
+ : _num_slices(0), _strided_slice_vector()
{
}
@@ -66,15 +67,19 @@ void CLUnstack::configure(const ICLTensor *input, const std::vector<ICLTensor *>
configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis);
}
-void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+void CLUnstack::configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const std::vector<ICLTensor *> &output_vector,
+ int axis)
{
ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
- std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t->info();
- });
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+ [](ICLTensor *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -87,11 +92,12 @@ void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTens
Coordinates slice_start;
int32_t slice_end_mask;
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
- for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ for (unsigned int slice = 0; slice < _num_slices; ++slice)
{
// Adjusts start and end coordinates to take a 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask, (1 << axis_u));
}
}
@@ -106,18 +112,20 @@ Status CLUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn
ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
Coordinates slice_start;
int32_t slice_end_mask;
- for(size_t k = 0; k < num_slices; ++k)
+ for (size_t k = 0; k < num_slices; ++k)
{
slice_start.set(wrap_axis(axis, input), k);
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
- ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask,
+ (1 << wrap_axis(axis, input))));
}
return Status{};
}
void CLUnstack::run()
{
- for(unsigned i = 0; i < _num_slices; ++i)
+ for (unsigned i = 0; i < _num_slices; ++i)
{
_strided_slice_vector[i].run();
}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index b416d0fcf1..645f817030 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/CL/ICLKernel.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"
@@ -35,15 +36,15 @@ namespace arm_compute
{
struct CLWinogradConvolutionLayer::Impl
{
- const ICLTensor *src{ nullptr };
- const ICLTensor *weights{ nullptr };
- const ICLTensor *biases{ nullptr };
- ICLTensor *dst{ nullptr };
- std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
+ const ICLTensor *src{nullptr};
+ const ICLTensor *weights{nullptr};
+ const ICLTensor *biases{nullptr};
+ ICLTensor *dst{nullptr};
+ std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr};
ITensorPack run_pack{};
MemoryGroup memory_group{};
WorkspaceData<CLTensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,15 +55,26 @@ CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryMa
CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default;
-void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
- bool enable_fast_math)
+void CLWinogradConvolutionLayer::configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+ enable_fast_math);
}
-void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
_impl->src = input;
_impl->weights = weights;
@@ -70,20 +82,25 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_conte
_impl->dst = output;
_impl->op = std::make_unique<opencl::ClWinogradConv2d>();
- _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math);
+ _impl->op->configure(compile_context, input->info(), weights->info(),
+ (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info,
+ enable_fast_math);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, _impl->src },
- { TensorType::ACL_SRC_1, _impl->weights },
- { TensorType::ACL_SRC_2, _impl->biases },
- { TensorType::ACL_DST, _impl->dst }
- };
- _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src},
+ {TensorType::ACL_SRC_1, _impl->weights},
+ {TensorType::ACL_SRC_2, _impl->biases},
+ {TensorType::ACL_DST, _impl->dst}};
+ _impl->workspace_tensors =
+ manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
-Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
}
@@ -97,7 +114,7 @@ void CLWinogradConvolutionLayer::run()
void CLWinogradConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
@@ -107,4 +124,4 @@ void CLWinogradConvolutionLayer::prepare()
_impl->is_prepared = true;
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
index 18ade97885..4270165ab4 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
@@ -34,8 +35,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -44,109 +44,109 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Default configurations for Bifrost architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G71 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G52 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
// Mali-G76 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 },
- { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32},
+ {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
const DataType data_type = params.data_type;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
- if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
+ if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
{
- return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G76:
- if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
+ if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
{
- return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G52:
- if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
+ if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
{
- return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
default:
- if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+ if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
{
- return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE;
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if((m > 1) && (n < 16))
+ if ((m > 1) && (n < 16))
{
gemm_type = CLGEMMKernelType::RESHAPED;
}
- else if(m == 1)
+ else if (m == 1)
{
gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if((k > 256) && (m > 4))
+ if ((k > 256) && (m > 4))
{
constexpr float alpha = 3.2f;
constexpr float fact0 = 1.51f;
constexpr float fact1 = 1.66f;
constexpr float ops = 12.0f;
const float scale = k > 1024 ? 1.07f : 1.0f;
- gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS;
+ gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops))
+ ? CLGEMMKernelType::RESHAPED
+ : CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
@@ -156,19 +156,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned
const auto workload = static_cast<float>((m * n) / 20.0f);
- gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type;
+ gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED
+ : gemm_type;
}
return gemm_type;
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -183,11 +185,12 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -197,21 +200,22 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned i
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(k <= 496)
+ if (k <= 496)
{
- if(n <= 544)
+ if (n <= 544)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -222,17 +226,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int
}
else
{
- if(k <= 588)
+ if (k <= 588)
{
- if(k <= 552)
+ if (k <= 552)
{
- if(m <= 148)
+ if (m <= 148)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 278)
+ if (m <= 278)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -254,16 +258,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -273,13 +278,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k));
- if(r_mn <= 1.5469f)
+ if (r_mn <= 1.5469f)
{
- if(r_mk <= 0.8766f)
+ if (r_mk <= 0.8766f)
{
- if(r_mk <= 0.0211f)
+ if (r_mk <= 0.0211f)
{
- if(r_mnk <= 77.5833f)
+ if (r_mnk <= 77.5833f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -290,7 +295,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_nk <= 0.0832f)
+ if (r_nk <= 0.0832f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -302,11 +307,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mnk <= 193.0000f)
+ if (r_mnk <= 193.0000f)
{
- if(r_mn <= 0.9948f)
+ if (r_mn <= 0.9948f)
{
- if(r_mk <= 2.5453f)
+ if (r_mk <= 2.5453f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -328,17 +333,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mn <= 17.7370f)
+ if (r_mn <= 17.7370f)
{
- if(r_mnk <= 1391.2875f)
+ if (r_mnk <= 1391.2875f)
{
- if(r_mk <= 2.9724f)
+ if (r_mk <= 2.9724f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_mnk <= 470.0000f)
+ if (r_mnk <= 470.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -350,9 +355,9 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_nk <= 0.1381f)
+ if (r_nk <= 0.1381f)
{
- if(r_mnk <= 9040.5000f)
+ if (r_mnk <= 9040.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -363,7 +368,7 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
else
{
- if(r_mn <= 5.6790f)
+ if (r_mn <= 5.6790f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -381,16 +386,17 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -398,21 +404,21 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
const float r_mn = static_cast<float>(m) / static_cast<float>(n);
const float r_nk = static_cast<float>(n) / static_cast<float>(k);
- if(k <= 212)
+ if (k <= 212)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_nk <= 0.4990234375f)
+ if (r_nk <= 0.4990234375f)
{
- if(k <= 1392)
+ if (k <= 1392)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 325)
+ if (m <= 325)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -424,13 +430,13 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
}
else
{
- if(k <= 471)
+ if (k <= 471)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(r_mn <= 0.04475911520421505f)
+ if (r_mn <= 0.04475911520421505f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -443,37 +449,38 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(n <= 127.0000f)
+ if (n <= 127.0000f)
{
- if(n <= 63.5000f)
+ if (n <= 63.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 3616.0000f)
+ if (m <= 3616.0000f)
{
- if(b <= 18.5000f)
+ if (b <= 18.5000f)
{
- if(m <= 2970.5000f)
+ if (m <= 2970.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 104.0000f)
+ if (k <= 104.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -496,19 +503,19 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
else
{
- if(m <= 12.5000f)
+ if (m <= 12.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 104.0000f)
+ if (k <= 104.0000f)
{
- if(b <= 18.5000f)
+ if (b <= 18.5000f)
{
- if(m <= 490.0000f)
+ if (m <= 490.0000f)
{
- if(n <= 272.0000f)
+ if (n <= 272.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -529,11 +536,11 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
else
{
- if(m <= 226.0000f)
+ if (m <= 226.0000f)
{
- if(n <= 140.0000f)
+ if (n <= 140.0000f)
{
- if(m <= 179.5000f)
+ if (m <= 179.5000f)
{
return CLGEMMKernelType::RESHAPED;
}
@@ -556,15 +563,16 @@ CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
ARM_COMPUTE_UNUSED(n);
ARM_COMPUTE_UNUSED(k);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
index ef30b28f96..673038a8db 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/GPUTarget.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
@@ -35,8 +36,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -45,22 +45,21 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Configurations for Midgard architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}};
const DataType data_type = params.data_type;
- if(gemm_configs.find(data_type) != gemm_configs.end())
+ if (gemm_configs.find(data_type) != gemm_configs.end())
{
return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
}
@@ -68,7 +67,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::select_kernel(const CLGEMMKernelSelec
ARM_COMPUTE_ERROR("Not supported data type");
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
@@ -76,7 +76,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned
return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(n, k, b);
@@ -84,7 +85,8 @@ CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned
return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant);
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
index 9e779d3752..851e23bc84 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include <map>
@@ -34,8 +35,7 @@ namespace arm_compute
{
namespace cl_gemm
{
-CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu)
- : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
{
}
@@ -44,135 +44,136 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::select_kernel(const CLGEMMKernelSelec
// _target could be used in the future to have a dedicated heuristic for each GPU IP
ARM_COMPUTE_UNUSED(_target);
- using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+ using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
// Default configurations for Valhall architectures
- static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G77 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G78 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G710 and Mali-G610 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
// Mali-G715 and Mali-G615 configurations
- static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs =
- {
- { DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32 },
- { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 },
- { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
- { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
- };
+ static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = {
+ {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32},
+ {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16},
+ {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+ {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
const DataType data_type = params.data_type;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G710:
case GPUTarget::G610:
- if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
+ if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
{
- return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G715:
case GPUTarget::G615:
- if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
+ if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
{
- return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G78:
- if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
+ if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
{
- return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
case GPUTarget::G77:
- if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
+ if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
{
- return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
default:
- if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+ if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
{
- return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+ return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+ params.is_rhs_constant);
}
ARM_COMPUTE_ERROR("Not supported data type");
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(is_rhs_constant)
+ if (is_rhs_constant)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -182,47 +183,48 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned i
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
- if(m == 1)
+ if (m == 1)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
- if(n <= 272.0000f)
+ if (n <= 272.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(k <= 471.0000f)
+ if (k <= 471.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 72.5000f)
+ if (m <= 72.5000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- if(m <= 90.5000f)
+ if (m <= 90.5000f)
{
return CLGEMMKernelType::RESHAPED;
}
else
{
- if(k <= 2448.0000f)
+ if (k <= 2448.0000f)
{
- if(n <= 756.0000f)
+ if (n <= 756.0000f)
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
@@ -241,11 +243,12 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
ARM_COMPUTE_UNUSED(m, n, k, b);
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return CLGEMMKernelType::NATIVE;
}
@@ -253,9 +256,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int
return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return default_f32(m, n, k, b, is_rhs_constant);
}
@@ -263,7 +267,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int
unsigned int best_m0;
unsigned int best_n0;
- if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+ if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
}
@@ -273,9 +277,10 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int
}
}
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
{
- if(!is_rhs_constant)
+ if (!is_rhs_constant)
{
return g78_f16(m, n, k, b, is_rhs_constant);
}
@@ -283,7 +288,7 @@ CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int
unsigned int best_m0;
unsigned int best_n0;
- if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+ if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
{
return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
}
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
index 6189a324cf..c528dbcac4 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -25,6 +25,7 @@
#define SRC_CLGEMMKERNELSELECTION_H
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h"
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h"
#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h"
@@ -45,7 +46,7 @@ public:
*/
static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu);
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
index b06c3b0f8e..8df57197e2 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
#include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
@@ -51,13 +52,15 @@ GEMMTypeResult select_mlgo_gemm_kernel(const CommonQuery &query, bool reshape_b_
bool valid = false;
CLGEMMKernelType gemm_type{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.",
+ to_string(gemm_type).c_str());
}
else
{
@@ -87,10 +90,11 @@ GEMMConfigResult select_default_gemm_config_reshaped_only_rhs(const CommonQuery
{
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
- std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
+ std::unique_ptr<IClGemmKernelConfig> gemm_config =
+ ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
@@ -100,32 +104,36 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &qu
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigReshapedOnlyRHS config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
        // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter
- std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs,
- config.export_cl_image);
+ std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs,
+ !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query)
{
GEMMLHSMatrixInfo lhs_info;
GEMMRHSMatrixInfo rhs_info;
- std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
+ std::unique_ptr<IClGemmKernelConfig> gemm_config =
+ ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
@@ -135,21 +143,24 @@ GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigReshaped config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
- std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs,
- config.transpose_rhs, config.export_cl_image);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
+ std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs,
+ config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
@@ -159,7 +170,7 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target);
ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
- return GEMMConfigResult{ true, lhs_info, rhs_info };
+ return GEMMConfigResult{true, lhs_info, rhs_info};
}
GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
@@ -169,23 +180,26 @@ GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
GEMMRHSMatrixInfo rhs_info;
mlgo::GEMMConfigNative config{};
const auto mlgo_heuristics = CLScheduler::get().gemm_heuristics();
- if(mlgo_heuristics != nullptr)
+ if (mlgo_heuristics != nullptr)
{
- std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+ std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(
+ mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
}
- if(valid)
+ if (valid)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+ to_string(config).c_str());
        // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter
- std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
+ std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(
+ query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
}
else
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
}
- return GEMMConfigResult{ valid, lhs_info, rhs_info };
+ return GEMMConfigResult{valid, lhs_info, rhs_info};
}
} // namespace auto_heuristics
} // namespace cl_gemm
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
index 020237b7f4..f544715e03 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
@@ -50,8 +50,7 @@ struct CommonQuery
/** Result of querying about GEMM type ( @ref CLGEMMKernelType) */
struct GEMMTypeResult
{
- GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type)
- : valid{ valid }, gemm_type{ gemm_type }
+ GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type}
{
}
/** Test if the result is valid */
@@ -67,7 +66,7 @@ struct GEMMTypeResult
struct GEMMConfigResult
{
GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info)
- : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info }
+ : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info}
{
}
/** Test if the result is valid */
@@ -134,4 +133,4 @@ GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query);
} // namespace cl_gemm
} // namespace arm_compute
-#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
\ No newline at end of file
+#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h
index c451bd9062..08a7ee8c18 100644
--- a/src/runtime/CL/mlgo/Common.h
+++ b/src/runtime/CL/mlgo/Common.h
@@ -45,37 +45,37 @@ using GEMMType = CLGEMMKernelType;
/** GEMM Configuration for Native kernel */
struct GEMMConfigNative
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
};
/** GEMM Configuration for Reshaped Only RHS kernel */
struct GEMMConfigReshapedOnlyRHS
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
-    bool         transpose_rhs{ false };  /**< True if the (k0xn0) block has to be transposed before being stored */
- bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool transpose_rhs{false};  /**< True if the (k0xn0) block has to be transposed before being stored */
+ bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
/** GEMM Configuration for Reshaped kernel */
struct GEMMConfigReshaped
{
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool interleave_lhs{ false }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
- bool interleave_rhs{ false }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
-    bool         transpose_rhs{ false };  /**< True if the (k0xn0) block has to be transposed before being stored */
- bool export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+ bool interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool transpose_rhs{false};   /**< True if the (k0xn0) block has to be transposed before being stored */
+ bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
} // namespace mlgo
} // namespace arm_compute
-#endif // SRC_RUNTIME_CL_MLGO_COMMON_H \ No newline at end of file
+#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
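The hunk above only reflows the brace-init-lists of the mlgo GEMM config structs; the members themselves are unchanged. As a quick reference for what these aggregates carry, here is a minimal, self-contained sketch that mirrors GEMMConfigReshapedOnlyRHS and fills it the way the parser later does; the sketch namespace and main() driver are illustrative assumptions, not part of the patch.

    // Minimal sketch mirroring the struct layout shown in the hunk above (C++14).
    #include <iostream>

    namespace sketch
    {
    struct GEMMConfigReshapedOnlyRHS
    {
        unsigned int m0{1};
        unsigned int n0{1};
        unsigned int k0{1};
        unsigned int h0{1};
        bool         interleave_rhs{false};
        bool         transpose_rhs{false};
        bool         export_cl_image{false};
    };
    } // namespace sketch

    int main()
    {
        // Aggregate initialization in member order, as the DotMLGO parser does for the real struct.
        sketch::GEMMConfigReshapedOnlyRHS cfg{4, 4, 8, 4, true, true, false};
        std::cout << cfg.m0 << "x" << cfg.n0 << "x" << cfg.k0 << ", h0=" << cfg.h0 << "\n";
        return 0;
    }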
diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp
index 1c75cdc427..f7b706902b 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.cpp
+++ b/src/runtime/CL/mlgo/HeuristicTree.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/runtime/CL/mlgo/HeuristicTree.h"
+
#include "arm_compute/core/Log.h"
#include "support/Cast.h"
@@ -40,27 +41,23 @@ bool evaluate(GEMMShape shape, Condition cond)
// PRE: all features and ConditionalOps are valid
constexpr float eps = 0.0001f;
// Calculate all secondary features
- std::vector<std::pair<std::string, float>> cond_values
- {
- { "m", static_cast<float>(shape.m) },
- { "n", static_cast<float>(shape.n) },
- { "k", static_cast<float>(shape.k) },
- { "b", static_cast<float>(shape.b) },
- { "r_mn", static_cast<float>(shape.m) / shape.n },
- { "r_mk", static_cast<float>(shape.m) / shape.k },
- { "r_nk", static_cast<float>(shape.n) / shape.k },
- { "r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k) },
- { "workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0 }
- };
- auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(),
- [&cond](decltype(*cond_values.begin()) it)
- {
- return it.first == cond.feature;
- });
+ std::vector<std::pair<std::string, float>> cond_values{
+ {"m", static_cast<float>(shape.m)},
+ {"n", static_cast<float>(shape.n)},
+ {"k", static_cast<float>(shape.k)},
+ {"b", static_cast<float>(shape.b)},
+ {"r_mn", static_cast<float>(shape.m) / shape.n},
+ {"r_mk", static_cast<float>(shape.m) / shape.k},
+ {"r_nk", static_cast<float>(shape.n) / shape.k},
+ {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)},
+ {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}};
+ auto cond_value_pair_it =
+ std::find_if(cond_values.begin(), cond_values.end(),
+ [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; });
ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end());
const float cond_value = cond_value_pair_it->second;
- switch(cond.op)
+ switch (cond.op)
{
case ConditionalOp::LT:
{
@@ -92,13 +89,12 @@ constexpr size_t HeuristicTree::_max_num_nodes;
constexpr size_t HeuristicTree::_max_query_depth;
constexpr HeuristicTree::NodeID HeuristicTree::_root;
-HeuristicTree::HeuristicTree()
- : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
+HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
{
}
HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type)
- : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{}
+ : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{}
{
}
@@ -108,16 +104,17 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const
// Root ID = 0;
auto cur_node = _tree.at(_root).get();
size_t depth = 0;
- while(cur_node->type() != NodeType::Leaf)
+ while (cur_node->type() != NodeType::Leaf)
{
- if(depth > _max_query_depth)
+ if (depth > _max_query_depth)
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?",
+ _max_query_depth);
return std::make_pair(false, T{});
}
ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType");
auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
- if(evaluate(shape, br_node->condition))
+ if (evaluate(shape, br_node->condition))
{
cur_node = _tree.at(br_node->true_node).get();
}
@@ -135,12 +132,12 @@ std::pair<bool, T> HeuristicTree::query(GEMMShape shape) const
template <typename T>
bool HeuristicTree::add_leaf(NodeID id, T val)
{
- if(_tree.size() >= _max_num_nodes)
+ if (_tree.size() >= _max_num_nodes)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
return false;
}
- if(_tree.find(id) != _tree.end())
+ if (_tree.find(id) != _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
return false;
@@ -151,28 +148,23 @@ bool HeuristicTree::add_leaf(NodeID id, T val)
bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
{
- if(_tree.size() >= _max_num_nodes)
+ if (_tree.size() >= _max_num_nodes)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
return false;
}
- const std::set<std::string> supported_features =
- {
- "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"
- };
- const auto orig_feature = cond.feature;
- std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c)
- {
- return std::tolower(c);
- });
- if(supported_features.find(cond.feature) == supported_features.end())
+ const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"};
+ const auto orig_feature = cond.feature;
+ std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(),
+ [](char c) { return std::tolower(c); });
+ if (supported_features.find(cond.feature) == supported_features.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str());
return false;
}
- if(_tree.find(id) != _tree.end())
+ if (_tree.find(id) != _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
return false;
@@ -184,32 +176,32 @@ bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID
bool HeuristicTree::check_if_structurally_correct() const
{
std::set<NodeID> visited;
- std::deque<NodeID> to_visit{ _root };
+ std::deque<NodeID> to_visit{_root};
- while(!to_visit.empty())
+ while (!to_visit.empty())
{
auto id = to_visit.front();
to_visit.pop_front();
- if(_tree.find(id) == _tree.end())
+ if (_tree.find(id) == _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id);
return false;
}
auto not_seen_before = visited.insert(id);
- if(!not_seen_before.second)
+ if (!not_seen_before.second)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops");
return false;
}
auto cur_node = _tree.at(id).get();
- if(cur_node->type() == NodeType::Branch)
+ if (cur_node->type() == NodeType::Branch)
{
auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
to_visit.push_back(br_node->true_node);
to_visit.push_back(br_node->false_node);
}
}
- if(visited.size() != _tree.size())
+ if (visited.size() != _tree.size())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes");
return false;
@@ -219,12 +211,12 @@ bool HeuristicTree::check_if_structurally_correct() const
bool HeuristicTree::check()
{
- if(_tree.empty())
+ if (_tree.empty())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered");
return false;
}
- if(_tree.find(_root) == _tree.end())
+ if (_tree.find(_root) == _tree.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root);
return false;
@@ -237,7 +229,8 @@ template std::pair<bool, GEMMType> HeuristicTree::query<GEMMType>(GEMMShape shap
/** Explicit template instantiation @relates HeuristicTree */
template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const;
/** Explicit template instantiation @relates HeuristicTree */
-template std::pair<bool, GEMMConfigReshapedOnlyRHS> HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
+template std::pair<bool, GEMMConfigReshapedOnlyRHS>
+HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
/** Explicit template instantiation @relates HeuristicTree */
template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const;
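The reformatted evaluate() and query() above walk a small binary decision tree: each branch node compares one derived GEMM feature (m, n, k, b, r_mn, ...) against a threshold, and the walk stops at a leaf carrying a GEMM type or config, bounded by a maximum depth. The stand-alone sketch below mirrors that walk under simplified, assumed types; none of the names are ComputeLibrary's.

    // Illustrative branch/leaf walk over a std::map, mirroring HeuristicTree::query() (C++14).
    // All types below are hypothetical stand-ins, not the library's classes.
    #include <cassert>
    #include <map>
    #include <string>

    struct Shape { float m, n, k, b; };
    struct Cond  { std::string feature; char op; float threshold; }; // op: '<' or '>'

    struct Node
    {
        bool is_leaf{false};
        Cond cond{};        // valid when !is_leaf
        int  true_node{0};  // next node when cond holds
        int  false_node{0}; // next node otherwise
        int  leaf_value{0}; // payload when is_leaf
    };

    static float feature_of(const Shape &s, const std::string &f)
    {
        if (f == "m") return s.m;
        if (f == "n") return s.n;
        if (f == "k") return s.k;
        if (f == "r_mn") return s.m / s.n;
        return s.b;
    }

    int query(const std::map<int, Node> &tree, const Shape &s)
    {
        int id = 0;                                // root id, like HeuristicTree::_root
        for (int depth = 0; depth < 1000; ++depth) // bounded like _max_query_depth
        {
            const Node &n = tree.at(id);
            if (n.is_leaf)
                return n.leaf_value;
            const float v   = feature_of(s, n.cond.feature);
            const bool  hit = (n.cond.op == '<') ? v < n.cond.threshold : v > n.cond.threshold;
            id = hit ? n.true_node : n.false_node;
        }
        return -1; // query exceeded the depth bound
    }

    int main()
    {
        std::map<int, Node> tree;
        tree[0] = Node{false, {"m", '<', 64.f}, 1, 2, 0};
        tree[1] = Node{true, {}, 0, 0, 10};
        tree[2] = Node{true, {}, 0, 0, 20};
        assert(query(tree, Shape{32, 32, 32, 1}) == 10);
        assert(query(tree, Shape{128, 32, 32, 1}) == 20);
        return 0;
    }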
diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h
index d5c7de2215..a4f8c116b9 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.h
+++ b/src/runtime/CL/mlgo/HeuristicTree.h
@@ -25,6 +25,7 @@
#define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
#include "arm_compute/core/Types.h"
+
#include "src/runtime/CL/mlgo/Common.h"
#include <map>
@@ -84,7 +85,7 @@ public:
struct BranchNode : public Node
{
BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
- : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node }
+ : id{id}, condition{cond}, true_node{t_node}, false_node{f_node}
{
}
NodeType type() const override
@@ -100,8 +101,7 @@ public:
template <typename T>
struct LeafNode : public Node
{
- LeafNode(NodeID id, T val)
- : id{ id }, value{ val }
+ LeafNode(NodeID id, T val) : id{id}, value{val}
{
}
NodeType type() const override
@@ -177,22 +177,22 @@ public:
bool check();
private:
- static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query
- static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree
- static constexpr NodeID _root{ 0 }; // Root tree ID
+ static constexpr size_t _max_query_depth{1000}; // Maximum depth of query
+ static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree
+ static constexpr NodeID _root{0}; // Root tree ID
private:
bool check_if_structurally_correct() const;
private:
- TreeID _id; /**< Heuristic tree ID */
- HeuristicType _heuristic_type; /**< Heuristic type */
- std::string _ip_target; /**< IP target associated with the tree */
- DataType _data_type; /**< Data type associated with the tree */
- std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
+ TreeID _id; /**< Heuristic tree ID */
+ HeuristicType _heuristic_type; /**< Heuristic type */
+ std::string _ip_target; /**< IP target associated with the tree */
+ DataType _data_type; /**< Data type associated with the tree */
+ std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
};
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
index 80f3bb85e9..aed46cd80f 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
@@ -24,6 +24,7 @@
#include "src/runtime/CL/mlgo/MLGOHeuristics.h"
#include "arm_compute/core/Log.h"
+
#include "src/runtime/CL/mlgo/MLGOParser.h"
#include "src/runtime/CL/mlgo/Utils.h"
@@ -39,19 +40,19 @@ bool operator==(const GEMMConfigNative &lhs, const GEMMConfigNative &rhs)
}
bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs)
{
- return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs,
- rhs.export_cl_image);
+ return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) ==
+ std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
}
bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs)
{
- return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0,
- rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
+ return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs,
+ lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs,
+ rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
}
constexpr size_t MLGOHeuristics::_max_num_trees;
-MLGOHeuristics::MLGOHeuristics()
- : _indices{}, _trees{}, _tree_valid{}, _valid{ false }
+MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false}
{
}
@@ -59,71 +60,74 @@ std::pair<bool, GEMMType> MLGOHeuristics::query_gemm_type(const Query &query) co
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str());
const auto invalid = GEMMType::RESHAPED;
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMType>(shape_query);
}
std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigNative{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigNative>(shape_query);
}
std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigReshapedOnlyRHS{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query);
}
std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.",
+ to_string(query).c_str());
const auto invalid = GEMMConfigReshaped{};
- if(!_valid)
+ if (!_valid)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
- return { false, invalid };
+ return {false, invalid};
}
auto index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type);
- GEMMShape shape_query{ query.m, query.n, query.k, query.b };
- if(_trees.find(index) == _trees.end())
+ GEMMShape shape_query{query.m, query.n, query.k, query.b};
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
- return { false, invalid };
+ return {false, invalid};
}
return _trees.at(index).query<GEMMConfigReshaped>(shape_query);
}
@@ -131,14 +135,14 @@ std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(c
bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
{
bool status;
- HeuristicTree *tree{ nullptr };
+ HeuristicTree *tree{nullptr};
std::tie(status, tree) = get_heuristic_tree(id);
- if(!status)
+ if (!status)
{
return status;
}
status = tree->check();
- if(!status)
+ if (!status)
{
return status;
}
@@ -149,14 +153,12 @@ bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
bool MLGOHeuristics::check_all() const
{
// Tree validities are already checked and cached.
- bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v)
- {
- return !v.second;
- })
- == _tree_valid.end();
- if(!all_trees_are_checked)
+ bool all_trees_are_checked =
+ std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end();
+ if (!all_trees_are_checked)
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each "
+ "tree is completed. This could also indicate there are no trees in the dotmlgo");
return false;
}
@@ -167,14 +169,14 @@ bool MLGOHeuristics::check_all() const
std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id)
{
- if(_indices.find(id) == _indices.end())
+ if (_indices.find(id) == _indices.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id);
return std::make_pair(false, nullptr);
}
const auto index = _indices[id];
- if(_trees.find(index) == _trees.end())
+ if (_trees.find(index) == _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
return std::make_pair(false, nullptr);
@@ -186,7 +188,7 @@ std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTre
bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
{
- if(_indices.size() >= _max_num_trees)
+ if (_indices.size() >= _max_num_trees)
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees);
return false;
@@ -194,7 +196,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
// PRE: correctness of t is guaranteed by the tree construction process
// Ensure unique id
const auto id = t.id();
- if(_indices.find(id) != _indices.end())
+ if (_indices.find(id) != _indices.end())
{
ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id);
return false;
@@ -202,7 +204,7 @@ bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
// Ensure unique index
const auto index = t.index();
- if(_trees.find(index) != _trees.end())
+ if (_trees.find(index) != _trees.end())
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists");
return false;
@@ -219,9 +221,10 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename)
std::ifstream fs;
fs.exceptions(std::ifstream::badbit);
fs.open(filename, std::ios::in);
- if(!fs.is_open())
+ if (!fs.is_open())
{
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str());
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead",
+ filename.c_str());
return _valid = false;
}
return reload_from_stream(fs);
@@ -230,7 +233,7 @@ bool MLGOHeuristics::reload_from_file(const std::string &filename)
bool MLGOHeuristics::reload_from_stream(std::istream &in)
{
auto parsed = parser::parse_mlgo(in);
- if(!parsed.first)
+ if (!parsed.first)
{
ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead");
return _valid = false;
@@ -241,4 +244,4 @@ bool MLGOHeuristics::reload_from_stream(std::istream &in)
}
} // namespace mlgo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
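Each query_gemm_* method above follows one pattern: bail out if the loaded DotMLGO is invalid, build a composite (heuristic type, IP target, data type) index, and return {false, default} when no matching tree exists. The stand-alone sketch below reproduces that lookup-with-fallback shape using std::map and std::tuple; the enum values and key layout are illustrative assumptions, not the library's definitions.

    // Minimal sketch of the "query with fallback" pattern used by MLGOHeuristics::query_gemm_* (C++14).
    #include <iostream>
    #include <map>
    #include <string>
    #include <tuple>
    #include <utility>

    enum class DataType { F16, F32, QASYMM8 };
    enum class GEMMType { NATIVE, RESHAPED_ONLY_RHS, RESHAPED };

    using Index = std::tuple<std::string /*ip_target*/, DataType>;

    std::pair<bool, GEMMType> query_gemm_type(const std::map<Index, GEMMType> &trees,
                                              const std::string &ip_target, DataType dt)
    {
        const GEMMType invalid = GEMMType::RESHAPED; // same fallback value as the real code
        const auto     it      = trees.find(Index{ip_target, dt});
        if (it == trees.end())
        {
            return {false, invalid}; // "Cannot find tree index"
        }
        return {true, it->second};
    }

    int main()
    {
        std::map<Index, GEMMType> trees;
        trees[Index{"G76", DataType::F32}] = GEMMType::RESHAPED_ONLY_RHS;

        const auto hit  = query_gemm_type(trees, "G76", DataType::F32);
        const auto miss = query_gemm_type(trees, "G52", DataType::F16);
        std::cout << "hit=" << hit.first << " miss=" << miss.first << "\n"; // hit=1 miss=0
        return 0;
    }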
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h
index aa21225959..6a491c5503 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.h
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.h
@@ -135,16 +135,16 @@ public:
bool check_all() const;
private:
- static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added*/
+ static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added*/
private:
// There exists a one-to-one mapping between TreeID and Index; either can be used to identify a @ref HeuristicTree
std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices; /**< A mapping from TreeID to Index */
std::map<HeuristicTree::Index, HeuristicTree> _trees; /**< A mapping from Index to HeuristicTree */
std::map<HeuristicTree::TreeID, bool> _tree_valid; /**< Result cache of the tree validity checks */
- bool _valid; /**< Overall validity */
+ bool _valid; /**< Overall validity */
};
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp
index 625739e450..893daf2ed9 100644
--- a/src/runtime/CL/mlgo/MLGOParser.cpp
+++ b/src/runtime/CL/mlgo/MLGOParser.cpp
@@ -22,19 +22,21 @@
* SOFTWARE.
*/
#include "src/runtime/CL/mlgo/MLGOParser.h"
+
#include "arm_compute/core/Log.h"
+
#include "src/runtime/CL/mlgo/Utils.h"
#include <sstream>
#define CHECK(parser_expr, valid_var) \
(parser_expr); \
- if(!valid_var) \
+ if (!valid_var) \
return;
#define CHECK_DEFAULT(parser_expr, valid_var, default_val) \
(parser_expr); \
- if(!valid_var) \
+ if (!valid_var) \
return default_val;
#ifdef ARM_COMPUTE_LOGGING_ENABLED
@@ -53,8 +55,7 @@
valid_var = false; \
return default_val;
-#define LOG_TOKEN_POS(tokens, pos_var) \
- const auto pos_var = tokens.current_pos();
+#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos();
#else // ARM_COMPUTE_LOGGING_ENABLED
@@ -73,19 +74,12 @@ namespace
{
void ltrim(std::string &str)
{
- str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch)
- {
- return !std::isspace(ch);
- }));
+ str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); }));
}
void rtrim(std::string &str)
{
- str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch)
- {
- return !std::isspace(ch);
- }).base(),
- str.end());
+ str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end());
}
void trim(std::string &str)
@@ -109,7 +103,7 @@ enum class ComparatorType
};
TokenStream::TokenStream(std::istream &s, const std::string &delims)
- : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{}
+ : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{}
{
read();
}
@@ -125,7 +119,7 @@ Token TokenStream::take()
ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
Token t = _tokens.front();
_tokens.pop_front();
- if(_tokens.empty())
+ if (_tokens.empty())
{
read();
}
@@ -136,7 +130,7 @@ Token TokenStream::peek(size_t i)
ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead");
// NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends an End token at the end
- while(_istream && _tokens.size() <= i)
+ while (_istream && _tokens.size() <= i)
{
read();
}
@@ -146,7 +140,7 @@ Token TokenStream::peek(size_t i)
void advance(CharPosition &pos, char ch)
{
- if(ch == '\n')
+ if (ch == '\n')
{
pos.ln += 1;
pos.col = 0;
@@ -167,17 +161,16 @@ void TokenStream::read()
do
{
// Reached eof
- if(!_istream.get(ch))
+ if (!_istream.get(ch))
{
- if(!reached_end())
+ if (!reached_end())
{
_tokens.emplace_back(TokenType::End, "", _lookahead_pos);
}
return;
}
advance(_lookahead_pos, ch);
- }
- while(std::isspace(ch) || is_delim(ch));
+ } while (std::isspace(ch) || is_delim(ch));
// Read chars until we hit a delim or eof
auto orig_pos = _lookahead_pos;
auto tok = recognize_tok(ch);
@@ -190,41 +183,41 @@ void TokenStream::read()
Token TokenStream::recognize_tok(char ch)
{
- if(ch == '[')
+ if (ch == '[')
{
- return Token{ TokenType::L_List, "", _lookahead_pos };
+ return Token{TokenType::L_List, "", _lookahead_pos};
}
- else if(ch == ']')
+ else if (ch == ']')
{
- return Token{ TokenType::R_List, "", _lookahead_pos };
+ return Token{TokenType::R_List, "", _lookahead_pos};
}
- else if(ch == '.')
+ else if (ch == '.')
{
- return float_after_dp_st(std::string{ ch });
+ return float_after_dp_st(std::string{ch});
}
- else if(std::isdigit(ch))
+ else if (std::isdigit(ch))
{
- return num_st(std::string{ ch });
+ return num_st(std::string{ch});
}
else
{
- return text_st(std::string{ ch });
+ return text_st(std::string{ch});
}
}
Token TokenStream::num_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(ch == '.')
+ if (ch == '.')
{
return float_after_dp_st(value + ch);
}
- else if(!std::isdigit(ch))
+ else if (!std::isdigit(ch))
{
- if(!is_delim(ch) && !std::isspace(ch))
+ if (!is_delim(ch) && !std::isspace(ch))
{
rewind(_lookahead_pos);
_istream.unget();
@@ -233,18 +226,18 @@ Token TokenStream::num_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Int, value, _lookahead_pos };
+ return Token{TokenType::Int, value, _lookahead_pos};
}
Token TokenStream::float_after_dp_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(!std::isdigit(ch))
+ if (!std::isdigit(ch))
{
- if(!is_delim(ch) && !std::isspace(ch))
+ if (!is_delim(ch) && !std::isspace(ch))
{
rewind(_lookahead_pos);
_istream.unget();
@@ -253,20 +246,20 @@ Token TokenStream::float_after_dp_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Float, value, _lookahead_pos };
+ return Token{TokenType::Float, value, _lookahead_pos};
}
Token TokenStream::text_st(std::string value)
{
char ch{};
- while(_istream.get(ch))
+ while (_istream.get(ch))
{
advance(_lookahead_pos, ch);
- if(is_delim(ch))
+ if (is_delim(ch))
{
break;
}
- if(ch == '[' || ch == ']')
+ if (ch == '[' || ch == ']')
{
rewind(_lookahead_pos);
_istream.unget();
@@ -274,7 +267,7 @@ Token TokenStream::text_st(std::string value)
}
value += ch;
}
- return Token{ TokenType::Text, value, _lookahead_pos };
+ return Token{TokenType::Text, value, _lookahead_pos};
}
bool TokenStream::reached_end() const
@@ -291,7 +284,7 @@ void end(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::End)
+ if (tok.type != TokenType::End)
{
FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream");
}
@@ -301,7 +294,7 @@ bool bool_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Int)
+ if (tok.type != TokenType::Int)
{
FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token");
}
@@ -314,7 +307,7 @@ int int_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Int)
+ if (tok.type != TokenType::Int)
{
FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token");
}
@@ -327,7 +320,7 @@ unsigned int uint_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
int val = CHECK_DEFAULT(int_val(in, valid), valid, 0);
- if(val < 0)
+ if (val < 0)
{
FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token");
}
@@ -338,7 +331,7 @@ float float_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Float)
+ if (tok.type != TokenType::Float)
{
FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token");
}
@@ -351,7 +344,7 @@ std::string text_val(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
auto tok = in.take();
- if(tok.type != TokenType::Text || tok.value.empty())
+ if (tok.type != TokenType::Text || tok.value.empty())
{
FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token");
}
@@ -361,9 +354,9 @@ std::string text_val(TokenStream &in, bool &valid)
bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
{
auto tok = in.peek();
- if(tok.type == TokenType::Text && tok.value == c_str)
+ if (tok.type == TokenType::Text && tok.value == c_str)
{
- if(take)
+ if (take)
{
in.take();
}
@@ -375,7 +368,7 @@ bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
void expect_text(TokenStream &in, const std::string &str, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_text(in, str))
+ if (!accept_text(in, str))
{
FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str);
}
@@ -384,7 +377,7 @@ void expect_text(TokenStream &in, const std::string &str, bool &valid)
bool accept_l_list(TokenStream &in)
{
auto tok = in.peek();
- if(tok.type == TokenType::L_List)
+ if (tok.type == TokenType::L_List)
{
in.take();
return true;
@@ -395,7 +388,7 @@ bool accept_l_list(TokenStream &in)
void expect_l_list(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_l_list(in))
+ if (!accept_l_list(in))
{
FAIL_WITH_MSG(valid, pos, "Expect '['");
}
@@ -404,7 +397,7 @@ void expect_l_list(TokenStream &in, bool &valid)
bool accept_r_list(TokenStream &in)
{
auto tok = in.peek();
- if(tok.type == TokenType::R_List)
+ if (tok.type == TokenType::R_List)
{
in.take();
return true;
@@ -415,7 +408,7 @@ bool accept_r_list(TokenStream &in)
void expect_r_list(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(!accept_r_list(in))
+ if (!accept_r_list(in))
{
FAIL_WITH_MSG(valid, pos, "Expect ']'");
}
@@ -424,23 +417,23 @@ void expect_r_list(TokenStream &in, bool &valid)
ConditionalOp conditional_op(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "<="))
+ if (accept_text(in, "<="))
{
return ConditionalOp::LE;
}
- else if(accept_text(in, ">="))
+ else if (accept_text(in, ">="))
{
return ConditionalOp::GE;
}
- else if(accept_text(in, "=="))
+ else if (accept_text(in, "=="))
{
return ConditionalOp::EQ;
}
- else if(accept_text(in, "<"))
+ else if (accept_text(in, "<"))
{
return ConditionalOp::LT;
}
- else if(accept_text(in, ">"))
+ else if (accept_text(in, ">"))
{
return ConditionalOp::GT;
}
@@ -464,11 +457,11 @@ void ip_type(TokenStream &in, bool &valid)
{
CHECK(expect_text(in, "ip-type", valid), valid);
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "gpu"))
+ if (accept_text(in, "gpu"))
{
;
}
- else if(accept_text(in, "cpu"))
+ else if (accept_text(in, "cpu"))
{
;
}
@@ -489,15 +482,15 @@ void header(TokenStream &in, bool &valid)
DataType data_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "f16"))
+ if (accept_text(in, "f16"))
{
return DataType::F16;
}
- else if(accept_text(in, "f32"))
+ else if (accept_text(in, "f32"))
{
return DataType::F32;
}
- else if(accept_text(in, "qasymm8"))
+ else if (accept_text(in, "qasymm8"))
{
return DataType::QASYMM8;
}
@@ -510,15 +503,15 @@ DataType data_type(TokenStream &in, bool &valid)
ComparatorType comparator_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "var"))
+ if (accept_text(in, "var"))
{
return ComparatorType::Var;
}
- else if(accept_text(in, "num"))
+ else if (accept_text(in, "num"))
{
return ComparatorType::Num;
}
- else if(accept_text(in, "enum"))
+ else if (accept_text(in, "enum"))
{
return ComparatorType::Enum;
}
@@ -531,19 +524,19 @@ ComparatorType comparator_type(TokenStream &in, bool &valid)
HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "gemm-type", take))
+ if (accept_text(in, "gemm-type", take))
{
return HeuristicType::GEMM_Type;
}
- else if(accept_text(in, "gemm-config-native", take))
+ else if (accept_text(in, "gemm-config-native", take))
{
return HeuristicType::GEMM_Config_Native;
}
- else if(accept_text(in, "gemm-config-reshaped-only-rhs", take))
+ else if (accept_text(in, "gemm-config-reshaped-only-rhs", take))
{
return HeuristicType::GEMM_Config_Reshaped_Only_RHS;
}
- else if(accept_text(in, "gemm-config-reshaped", take))
+ else if (accept_text(in, "gemm-config-reshaped", take))
{
return HeuristicType::GEMM_Config_Reshaped;
}
@@ -557,7 +550,7 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val
{
LOG_TOKEN_POS(in, pos);
auto ht = CHECK(heuristic_type(in, valid, false), valid);
- if(ht != expected_ht)
+ if (ht != expected_ht)
{
FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type");
}
@@ -567,15 +560,15 @@ void expect_heuristic_type(TokenStream &in, HeuristicType expected_ht, bool &val
GEMMType gemm_type(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "native"))
+ if (accept_text(in, "native"))
{
return GEMMType::NATIVE;
}
- else if(accept_text(in, "reshaped-only-rhs"))
+ else if (accept_text(in, "reshaped-only-rhs"))
{
return GEMMType::RESHAPED_ONLY_RHS;
}
- else if(accept_text(in, "reshaped"))
+ else if (accept_text(in, "reshaped"))
{
return GEMMType::RESHAPED;
}
@@ -593,7 +586,7 @@ GEMMConfigNative gemm_config_native(TokenStream &in, bool &valid)
const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigNative{ m0, n0, k0 };
+ return GEMMConfigNative{m0, n0, k0};
}
GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid)
@@ -608,7 +601,7 @@ GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &v
const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex };
+ return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex};
}
GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
@@ -625,17 +618,17 @@ GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
- return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex };
+ return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex};
}
void gpu_priority(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "best-performance"))
+ if (accept_text(in, "best-performance"))
{
;
}
- else if(accept_text(in, "best-memory-usage"))
+ else if (accept_text(in, "best-memory-usage"))
{
;
}
@@ -648,11 +641,11 @@ void gpu_priority(TokenStream &in, bool &valid)
void gpu_behavior(TokenStream &in, bool &valid)
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "static"))
+ if (accept_text(in, "static"))
{
;
}
- else if(accept_text(in, "dynamic"))
+ else if (accept_text(in, "dynamic"))
{
;
}
@@ -665,7 +658,7 @@ void gpu_behavior(TokenStream &in, bool &valid)
void free_vars(TokenStream &in, bool &valid)
{
CHECK(expect_l_list(in, valid), valid);
- while(!accept_r_list(in))
+ while (!accept_r_list(in))
{
CHECK(text_val(in, valid), valid);
}
@@ -688,7 +681,7 @@ void heuristics_table_entry(TokenStream &in, MLGOHeuristics &h, bool &valid)
void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid)
{
CHECK(expect_text(in, "<heuristics-table>", valid), valid);
- while(!accept_text(in, "</heuristics-table>"))
+ while (!accept_text(in, "</heuristics-table>"))
{
CHECK(heuristics_table_entry(in, h, valid), valid);
}
@@ -705,11 +698,12 @@ Condition condition(TokenStream &in, bool &valid)
const auto c_o = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val);
const auto r_t = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val);
const auto r_v = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val);
- if(l_t != ComparatorType::Var || r_t != ComparatorType::Num)
+ if (l_t != ComparatorType::Var || r_t != ComparatorType::Num)
{
- FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
+ FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos,
+ "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
}
- return Condition{ l_v, c_o, r_v };
+ return Condition{l_v, c_o, r_v};
}
void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
@@ -717,13 +711,13 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
CHECK(expect_text(in, "<heuristic", valid), valid);
const auto tree_id = CHECK(uint_val(in, valid), valid);
CHECK(expect_text(in, ">", valid), valid);
- HeuristicTree *t = nullptr;
- std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
+ HeuristicTree *t = nullptr;
+ std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
const HeuristicType t_heuristic_type = std::get<0>(t->index());
- while(!accept_text(in, "</heuristic>"))
+ while (!accept_text(in, "</heuristic>"))
{
LOG_TOKEN_POS(in, pos);
- if(accept_text(in, "b"))
+ if (accept_text(in, "b"))
{
// Branch node
const auto id = CHECK(uint_val(in, valid), valid);
@@ -732,7 +726,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
const auto f_id = CHECK(uint_val(in, valid), valid);
valid = CHECK(t->add_branch(id, cond, t_id, f_id), valid);
}
- else if(accept_text(in, "l"))
+ else if (accept_text(in, "l"))
{
// Leaf node
const auto id = CHECK(uint_val(in, valid), valid);
@@ -740,7 +734,7 @@ void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
// heuristic table). For now it remains as a step for validation.
LOG_TOKEN_POS(in, pos);
CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid);
- switch(t_heuristic_type)
+ switch (t_heuristic_type)
{
case HeuristicType::GEMM_Type:
{
@@ -786,7 +780,7 @@ MLGOHeuristics mlgo(TokenStream &in, bool &valid)
MLGOHeuristics h;
CHECK_DEFAULT(header(in, valid), valid, h);
CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h);
- while(accept_text(in, "<heuristic", false))
+ while (accept_text(in, "<heuristic", false))
{
CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h);
}
@@ -809,4 +803,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in)
#undef CHECK
#undef CHECK_DEFAULT
#undef FAIL_WITH_MSG
-#undef FAIL_WITH_MSG_DEFAULT \ No newline at end of file
+#undef FAIL_WITH_MSG_DEFAULT
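The ltrim/rtrim helpers reformatted above are the usual erase/find_if whitespace-trimming idiom. A self-contained version with a tiny usage check follows; the function bodies match the hunk, while the main() driver is illustrative only.

    #include <algorithm>
    #include <cassert>
    #include <cctype>
    #include <string>

    // Same idiom as the ltrim/rtrim helpers in MLGOParser.cpp.
    static void ltrim(std::string &str)
    {
        str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); }));
    }

    static void rtrim(std::string &str)
    {
        str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end());
    }

    int main()
    {
        std::string s = "  gemm-type \t";
        ltrim(s);
        rtrim(s);
        assert(s == "gemm-type");
        return 0;
    }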
diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h
index 49d8b9c644..cffce8d6a1 100644
--- a/src/runtime/CL/mlgo/MLGOParser.h
+++ b/src/runtime/CL/mlgo/MLGOParser.h
@@ -98,15 +98,14 @@ struct CharPosition
return ln == other.ln && col == other.col;
}
- size_t ln{ 0 };
- size_t col{ 0 };
+ size_t ln{0};
+ size_t col{0};
};
/** Token */
struct Token
{
- Token(TokenType t, std::string v, CharPosition pos)
- : type{ t }, value{ v }, pos{ pos }
+ Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos}
{
}
@@ -196,4 +195,4 @@ std::pair<bool, MLGOHeuristics> parse_mlgo(std::istream &in);
} // namespace parser
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp
index 81d418c28e..c7e0100b3c 100644
--- a/src/runtime/CL/mlgo/Utils.cpp
+++ b/src/runtime/CL/mlgo/Utils.cpp
@@ -43,40 +43,38 @@ inline std::string to_str(const T &val)
std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config)
{
return os << "Native:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "}";
}
std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config)
{
return os << "ReshapedOnlyRHS:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "h0: " << config.h0 << ", "
- << "interleave_rhs: " << config.interleave_rhs << ", "
- << "transpose_rhs: " << config.transpose_rhs << ", "
- << "export_cl_image: " << config.export_cl_image
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "h0: " << config.h0 << ", "
+ << "interleave_rhs: " << config.interleave_rhs << ", "
+ << "transpose_rhs: " << config.transpose_rhs << ", "
+ << "export_cl_image: " << config.export_cl_image << "}";
}
std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config)
{
return os << "Reshaped:{"
- << "m0: " << config.m0 << ", "
- << "n0: " << config.n0 << ", "
- << "k0: " << config.k0 << ", "
- << "v0: " << config.v0 << ", "
- << "h0: " << config.h0 << ", "
- << "interleave_lhs: " << config.interleave_lhs << ", "
- << "interleave_rhs: " << config.interleave_rhs << ", "
- << "transpose_rhs: " << config.transpose_rhs << ", "
- << "export_cl_image: " << config.export_cl_image
- << "}";
+ << "m0: " << config.m0 << ", "
+ << "n0: " << config.n0 << ", "
+ << "k0: " << config.k0 << ", "
+ << "v0: " << config.v0 << ", "
+ << "h0: " << config.h0 << ", "
+ << "interleave_lhs: " << config.interleave_lhs << ", "
+ << "interleave_rhs: " << config.interleave_rhs << ", "
+ << "transpose_rhs: " << config.transpose_rhs << ", "
+ << "export_cl_image: " << config.export_cl_image << "}";
}
std::ostream &operator<<(std::ostream &os, HeuristicType ht)
{
- switch(ht)
+ switch (ht)
{
case HeuristicType::GEMM_Type:
{
@@ -103,7 +101,7 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht)
}
std::ostream &operator<<(std::ostream &os, DataType dt)
{
- switch(dt)
+ switch (dt)
{
case DataType::F32:
{
@@ -184,4 +182,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos)
} // namespace mlgo
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
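The operator<< overloads above are what the mlgo to_string() helpers build on: stream the object into a std::ostringstream and return str(). A minimal sketch of that idiom follows, using a stand-in config struct rather than the real mlgo types.

    #include <iostream>
    #include <sstream>
    #include <string>

    struct Config { unsigned int m0{1}, n0{1}, k0{1}; }; // stand-in, not the mlgo struct

    std::ostream &operator<<(std::ostream &os, const Config &c)
    {
        return os << "Native:{m0: " << c.m0 << ", n0: " << c.n0 << ", k0: " << c.k0 << "}";
    }

    // Same pattern as the to_str()/to_string() helpers: funnel operator<< through an ostringstream.
    template <typename T>
    std::string to_str(const T &val)
    {
        std::ostringstream ss;
        ss << val;
        return ss.str();
    }

    int main()
    {
        std::cout << to_str(Config{4, 4, 8}) << "\n"; // Native:{m0: 4, n0: 4, k0: 8}
        return 0;
    }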
diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h
index c634a887e9..73b537f476 100644
--- a/src/runtime/CL/mlgo/Utils.h
+++ b/src/runtime/CL/mlgo/Utils.h
@@ -43,10 +43,10 @@ std::ostream &operator<<(std::ostream &os, HeuristicType ht);
std::ostream &operator<<(std::ostream &os, DataType dt);
std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index);
std::ostream &operator<<(std::ostream &os, const Query &query);
-std::string to_string(const GEMMConfigNative &config);
-std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
-std::string to_string(const GEMMConfigReshaped &config);
-std::string to_string(const Query &query);
+std::string to_string(const GEMMConfigNative &config);
+std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
+std::string to_string(const GEMMConfigReshaped &config);
+std::string to_string(const Query &query);
namespace parser
{
std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
@@ -54,4 +54,4 @@ std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
} // namespace mlgo
} // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_UTILS_H \ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 6f3e32491a..5e3907f1ea 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -27,20 +27,20 @@ namespace arm_compute
{
namespace cl_tuner
{
-constexpr unsigned int max_lws_supported_x{ 64u };
-constexpr unsigned int max_lws_supported_y{ 32u };
-constexpr unsigned int max_lws_supported_z{ 32u };
+constexpr unsigned int max_lws_supported_x{64u};
+constexpr unsigned int max_lws_supported_y{32u};
+constexpr unsigned int max_lws_supported_z{32u};
/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */
class CLTuningParametersList : public ICLTuningParametersList
{
protected:
/* Shape of 4-D search space */
- TensorShape search_space_shape{ 0, 0, 0, 0 };
- std::vector<unsigned int> _lws_x{ 0 };
- std::vector<unsigned int> _lws_y{ 0 };
- std::vector<unsigned int> _lws_z{ 0 };
- std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+ TensorShape search_space_shape{0, 0, 0, 0};
+ std::vector<unsigned int> _lws_x{0};
+ std::vector<unsigned int> _lws_y{0};
+ std::vector<unsigned int> _lws_z{0};
+ std::vector<int> _wbsm{0}; /* Modify the batches size of workgroups distributed to compute units.
The value is in the range [-31,+31].
When 0, the runtime-selected wbs used is unmodified. */
@@ -116,7 +116,8 @@ private:
* @param[in] lws_max Max LWS value allowed to be tested
* @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
*/
- void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
+ void
+ initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
};
/** A minimal subset of LWS values that only have 1,2 and 4/8 */
@@ -170,9 +171,9 @@ CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDR
search_space_shape[1] = lws_y_max;
search_space_shape[2] = lws_z_max;
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -3, -2, -1, 0, 1, 2, 3 };
+ _wbsm = {-3, -2, -1, 0, 1, 2, 3};
search_space_shape[3] = _wbsm.size();
}
}
@@ -194,26 +195,31 @@ CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gw
_lws_x = {};
_lws_y = {};
_lws_z = {};
- initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
- initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_x, gws[0], lws_x_max,
+ gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+ initialize_lws_values(_lws_y, gws[1], lws_y_max,
+ gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
search_space_shape[0] = _lws_x.size();
search_space_shape[1] = _lws_y.size();
search_space_shape[2] = _lws_z.size();
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -2, -1, 0, 1, 2 };
+ _wbsm = {-2, -1, 0, 1, 2};
search_space_shape[3] = _wbsm.size();
}
}
-void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws,
+ unsigned int gws,
+ unsigned int lws_max,
+ bool mod_let_one)
{
lws.push_back(1);
- for(unsigned int i = 2; i <= lws_max; ++i)
+ for (unsigned int i = 2; i <= lws_max; ++i)
{
// Power of two condition
const bool is_power_of_two = (i & (i - 1)) == 0;
@@ -221,7 +227,7 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
// Condition for the module accordingly with the mod_let_one flag
const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
- if(mod_cond || is_power_of_two)
+ if (mod_cond || is_power_of_two)
{
lws.push_back(i);
}
@@ -246,9 +252,9 @@ CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws,
search_space_shape[1] = _lws_y.size();
search_space_shape[2] = _lws_z.size();
search_space_shape[3] = 1;
- if(tuning_info.tune_wbsm)
+ if (tuning_info.tune_wbsm)
{
- _wbsm = { -1, 0, 1 };
+ _wbsm = {-1, 0, 1};
search_space_shape[3] = _wbsm.size();
}
}
@@ -257,7 +263,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
{
lws.push_back(1);
- for(unsigned int i = 2; i <= lws_max; i *= 4)
+ for (unsigned int i = 2; i <= lws_max; i *= 4)
{
lws.push_back(i);
}
@@ -265,7 +271,7 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
{
- switch(tuning_info.tuner_mode)
+ switch (tuning_info.tuner_mode)
{
case CLTunerMode::EXHAUSTIVE:
return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
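initialize_lws_values() above accepts a candidate local work size when it is a power of two, or when gws modulo the candidate satisfies the mod_let_one condition. The same filter is easy to exercise in isolation; the sketch below reproduces the loop body outside the class, with made-up example values in main().

    #include <iostream>
    #include <vector>

    // Same candidate filter as CLTuningParametersListNormal::initialize_lws_values().
    std::vector<unsigned int> candidate_lws(unsigned int gws, unsigned int lws_max, bool mod_let_one)
    {
        std::vector<unsigned int> lws;
        lws.push_back(1);
        for (unsigned int i = 2; i <= lws_max; ++i)
        {
            const bool is_power_of_two = (i & (i - 1)) == 0;
            const bool mod_cond        = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
            if (mod_cond || is_power_of_two)
            {
                lws.push_back(i);
            }
        }
        return lws;
    }

    int main()
    {
        // Example: gws = 24, lws capped at 8, only exact divisors allowed.
        for (unsigned int v : candidate_lws(24, 8, false))
        {
            std::cout << v << ' '; // 1 2 3 4 6 8
        }
        std::cout << '\n';
        return 0;
    }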
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 45e872428f..9fbdc3a4dd 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/Utility.h"
+
#include "support/Mutex.h"
#include <atomic>
@@ -53,8 +54,7 @@ public:
* @param[in] start First value that will be returned by the feeder
* @param[in] end End condition (The last value returned by get_next() will be end - 1)
*/
- explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0)
- : _atomic_counter(start), _end(end)
+ explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end)
{
}
/** Return the next element in the range if there is one.
@@ -89,8 +89,7 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede
{
ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size());
workloads[workload_index](info);
- }
- while(feeder.get_next(workload_index));
+ } while (feeder.get_next(workload_index));
}
/** Set thread affinity. Pin current thread to a particular core
@@ -99,7 +98,7 @@ void process_workloads(std::vector<IScheduler::Workload> &workloads, ThreadFeede
*/
void set_thread_affinity(int core_id)
{
- if(core_id < 0)
+ if (core_id < 0)
{
return;
}
@@ -150,10 +149,10 @@ public:
*/
explicit Thread(int core_pin = -1);
- Thread(const Thread &) = delete;
+ Thread(const Thread &) = delete;
Thread &operator=(const Thread &) = delete;
Thread(Thread &&) = delete;
- Thread &operator=(Thread &&) = delete;
+ Thread &operator=(Thread &&) = delete;
/** Destructor. Make the thread join. */
~Thread();
@@ -196,21 +195,20 @@ public:
private:
std::thread _thread{};
ThreadInfo _info{};
- std::vector<IScheduler::Workload> *_workloads{ nullptr };
- ThreadFeeder *_feeder{ nullptr };
+ std::vector<IScheduler::Workload> *_workloads{nullptr};
+ ThreadFeeder *_feeder{nullptr};
std::mutex _m{};
std::condition_variable _cv{};
- bool _wait_for_work{ false };
- bool _job_complete{ true };
- std::exception_ptr _current_exception{ nullptr };
- int _core_pin{ -1 };
- std::list<Thread> *_thread_pool{ nullptr };
- unsigned int _wake_beg{ 0 };
- unsigned int _wake_end{ 0 };
+ bool _wait_for_work{false};
+ bool _job_complete{true};
+ std::exception_ptr _current_exception{nullptr};
+ int _core_pin{-1};
+ std::list<Thread> *_thread_pool{nullptr};
+ unsigned int _wake_beg{0};
+ unsigned int _wake_end{0};
};
-Thread::Thread(int core_pin)
- : _core_pin(core_pin)
+Thread::Thread(int core_pin) : _core_pin(core_pin)
{
_thread = std::thread(&Thread::worker_thread, this);
}
@@ -218,7 +216,7 @@ Thread::Thread(int core_pin)
Thread::~Thread()
{
// Make sure worker thread has ended
- if(_thread.joinable())
+ if (_thread.joinable())
{
ThreadFeeder feeder;
set_workload(nullptr, feeder, ThreadInfo());
@@ -257,7 +255,7 @@ void Thread::worker_thread()
{
set_thread_affinity(_core_pin);
- while(true)
+ while (true)
{
std::unique_lock<std::mutex> lock(_m);
_cv.wait(lock, [&] { return _wait_for_work; });
@@ -266,18 +264,18 @@ void Thread::worker_thread()
_current_exception = nullptr;
// Exit if the worker thread has not been fed with workloads
- if(_workloads == nullptr || _feeder == nullptr)
+ if (_workloads == nullptr || _feeder == nullptr)
{
return;
}
// Wake up more peer threads from thread pool if this job has been delegated to the current thread
- if(_thread_pool != nullptr)
+ if (_thread_pool != nullptr)
{
auto thread_it = _thread_pool->begin();
std::advance(thread_it, std::min(static_cast<unsigned int>(_thread_pool->size()), _wake_beg));
auto wake_end = std::min(_wake_end, static_cast<unsigned int>(_info.num_threads - 1));
- for(unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it)
+ for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it)
{
thread_it->start();
}
@@ -291,7 +289,7 @@ void Thread::worker_thread()
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(...)
+ catch (...)
{
_current_exception = std::current_exception();
}
@@ -322,11 +320,11 @@ struct CPPScheduler::Impl final
: _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U)
{
const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE"));
- if(mode_env_v == "linear")
+ if (mode_env_v == "linear")
{
_forced_mode = ModeToggle::Linear;
}
- else if(mode_env_v == "fanout")
+ else if (mode_env_v == "fanout")
{
_forced_mode = ModeToggle::Fanout;
}
@@ -350,7 +348,7 @@ struct CPPScheduler::Impl final
// Set affinity on worker threads
_threads.clear();
- for(auto i = 1U; i < _num_threads; ++i)
+ for (auto i = 1U; i < _num_threads; ++i)
{
_threads.emplace_back(func(i, thread_hint));
}
@@ -359,20 +357,23 @@ struct CPPScheduler::Impl final
void auto_switch_mode(unsigned int num_threads_to_use)
{
// If the environment variable is set to either mode, it overrides the mode selected based on num_threads_to_use
- if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
+ if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
{
set_fanout_mode(m_default_wake_fanout, num_threads_to_use);
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n",
+ this->wake_fanout(), num_threads_to_use);
}
else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8))
{
set_linear_mode();
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n",
+ num_threads_to_use);
}
}
void set_linear_mode()
{
- for(auto &thread : _threads)
+ for (auto &thread : _threads)
{
thread.set_linear_mode();
}
@@ -384,14 +385,14 @@ struct CPPScheduler::Impl final
ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1);
const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1));
auto thread_it = _threads.begin();
- for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
+ for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
{
const auto wake_begin = i * actual_wake_fanout - 1;
const auto wake_end = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1);
thread_it->set_fanout_mode(&_threads, wake_begin, wake_end);
}
// Reset the remaining threads' wake-up schedule
- while(thread_it != _threads.end())
+ while (thread_it != _threads.end())
{
thread_it->set_fanout_mode(&_threads, 0U, 0U);
++thread_it;
@@ -417,9 +418,9 @@ struct CPPScheduler::Impl final
unsigned int _num_threads;
std::list<Thread> _threads;
arm_compute::Mutex _run_workloads_mutex{};
- Mode _mode{ Mode::Linear };
- ModeToggle _forced_mode{ ModeToggle::None };
- unsigned int _wake_fanout{ 0 };
+ Mode _mode{Mode::Linear};
+ ModeToggle _forced_mode{ModeToggle::None};
+ unsigned int _wake_fanout{0};
};
/*
@@ -431,8 +432,7 @@ CPPScheduler &CPPScheduler::get()
return scheduler;
}
-CPPScheduler::CPPScheduler()
- : _impl(std::make_unique<Impl>(num_threads_hint()))
+CPPScheduler::CPPScheduler() : _impl(std::make_unique<Impl>(num_threads_hint()))
{
}
@@ -465,15 +465,15 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
// This is not great because different threads' workloads won't run in parallel but at least they
// won't interfere with each other or deadlock.
arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex);
- const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size()));
- if(num_threads_to_use < 1)
+ const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size()));
+ if (num_threads_to_use < 1)
{
return;
}
// Re-adjust the mode if the actual number of threads to use is different from the number of threads created
_impl->auto_switch_mode(num_threads_to_use);
int num_threads_to_start = 0;
- switch(_impl->mode())
+ switch (_impl->mode())
{
case CPPScheduler::Impl::Mode::Fanout:
{
@@ -494,22 +494,22 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
unsigned int t = 0;
auto thread_it = _impl->_threads.begin();
// Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread
- for(; t < num_threads_to_use - 1; ++t, ++thread_it)
+ for (; t < num_threads_to_use - 1; ++t, ++thread_it)
{
info.thread_id = t;
thread_it->set_workload(&workloads, feeder, info);
}
thread_it = _impl->_threads.begin();
- for(int i = 0; i < num_threads_to_start; ++i, ++thread_it)
+ for (int i = 0; i < num_threads_to_start; ++i, ++thread_it)
{
thread_it->start();
}
- info.thread_id = t; // Set main thread's thread_id
+ info.thread_id = t; // Set main thread's thread_id
std::exception_ptr last_exception = nullptr;
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
try
{
-#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
process_workloads(workloads, feeder, info); // Main thread processes workloads
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
@@ -522,7 +522,7 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
{
#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
thread_it = _impl->_threads.begin();
- for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it)
+ for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it)
{
std::exception_ptr current_exception = thread_it->wait();
if (current_exception)
@@ -536,7 +536,7 @@ void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
}
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::system_error &e)
+ catch (const std::system_error &e)
{
std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
}
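
The reformatted set_fanout_mode() loop above computes a wake_begin/wake_end pair for each worker thread, which defines the fanout wake-up schedule. As a point of reference, here is a minimal standalone sketch that reproduces only that arithmetic so the schedule can be inspected in isolation; it is not part of this patch, the thread count and fanout are invented values, and the interval convention of wake_end (inclusive or exclusive) is not visible in this hunk, so the sketch simply prints the pair.

// Illustrative sketch, not from the library: wake-up ranges as computed in
// CPPScheduler::Impl::set_fanout_mode() above, for made-up parameters.
#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned int num_threads_to_use = 16U; // hypothetical
    const unsigned int wake_fanout        = 4U;  // hypothetical
    const unsigned int actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1));

    for (unsigned int i = 1U; i < num_threads_to_use; ++i)
    {
        const unsigned int wake_begin = i * actual_wake_fanout - 1;
        const unsigned int wake_end   = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1);
        std::printf("worker %u: wake_begin=%u wake_end=%u\n", i, wake_begin, wake_end);
    }
    return 0;
}
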
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 5890553f6f..c46a2731d8 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -39,10 +39,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
{
const Window &max_window = kernel->window();
- if(hints.split_dimension() != IScheduler::split_dimensions_all)
+ if (hints.split_dimension() != IScheduler::split_dimensions_all)
{
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
- if(num_iterations < 1)
+ if (num_iterations < 1)
{
return;
}
@@ -53,7 +53,10 @@ void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
kernel->run(kernel->window(), info);
}
-void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors)
+void SingleThreadScheduler::schedule_op(ICPPKernel *kernel,
+ const Hints &hints,
+ const Window &window,
+ ITensorPack &tensors)
{
ARM_COMPUTE_UNUSED(hints);
ThreadInfo info;
@@ -65,7 +68,7 @@ void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads)
{
ThreadInfo info;
info.cpu_info = &cpu_info();
- for(auto &wl : workloads)
+ for (auto &wl : workloads)
{
wl(info);
}
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index dccbe4045d..94a1673d59 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -42,28 +42,37 @@ void dequantize_tensor(const ITensor *input, ITensor *output)
Iterator input_it(input, window);
Iterator output_it(output, window);
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) =
+ dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM8_SIGNED:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) =
+ dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM16:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<float *>(output_it.ptr()) =
+ dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+ },
+ input_it, output_it);
break;
default:
ARM_COMPUTE_ERROR("Unsupported data type");
@@ -80,28 +89,37 @@ void quantize_tensor(const ITensor *input, ITensor *output)
Iterator input_it(input, window);
Iterator output_it(output, window);
- switch(data_type)
+ switch (data_type)
{
case DataType::QASYMM8:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<uint8_t *>(output_it.ptr()) =
+ quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM8_SIGNED:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<int8_t *>(output_it.ptr()) =
+ quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
case DataType::QASYMM16:
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
- },
- input_it, output_it);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ *reinterpret_cast<uint16_t *>(output_it.ptr()) =
+ quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+ },
+ input_it, output_it);
break;
default:
ARM_COMPUTE_ERROR("Unsupported data type");
@@ -132,14 +150,23 @@ CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::sh
{
}
-void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in,
- ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out,
- ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out,
+ ITensor *keeps,
+ ITensor *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
- ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+ ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out,
+ keeps, keeps_size, info);
- _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
+ _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 ||
+ scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
_scores_in = scores_in;
_boxes_in = boxes_in;
@@ -150,7 +177,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co
_batch_splits_out = batch_splits_out;
_keeps = keeps;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
// Manage intermediate buffers
_memory_group.manage(&_scores_in_f32);
@@ -160,7 +187,7 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co
_memory_group.manage(&_classes_f32);
_scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32));
_boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32));
- if(batch_splits_in != nullptr)
+ if (batch_splits_in != nullptr)
{
_memory_group.manage(&_batch_splits_in_f32);
_batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32));
@@ -168,58 +195,70 @@ void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, co
_scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32));
_boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32));
_classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32));
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
_memory_group.manage(&_batch_splits_out_f32);
_batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32));
}
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
_memory_group.manage(&_keeps_f32);
_keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32));
}
- _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
+ _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32,
+ (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
&_scores_out_f32, &_boxes_out_f32, &_classes_f32,
- (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr,
- keeps_size, info);
+ (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr,
+ (keeps != nullptr) ? &_keeps_f32 : nullptr, keeps_size, info);
}
else
{
- _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+ _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes,
+ batch_splits_out, keeps, keeps_size, info);
}
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_scores_in_f32.allocator()->allocate();
_boxes_in_f32.allocator()->allocate();
- if(_batch_splits_in != nullptr)
+ if (_batch_splits_in != nullptr)
{
_batch_splits_in_f32.allocator()->allocate();
}
_scores_out_f32.allocator()->allocate();
_boxes_out_f32.allocator()->allocate();
_classes_f32.allocator()->allocate();
- if(batch_splits_out != nullptr)
+ if (batch_splits_out != nullptr)
{
_batch_splits_out_f32.allocator()->allocate();
}
- if(keeps != nullptr)
+ if (keeps != nullptr)
{
_keeps_f32.allocator()->allocate();
}
}
}
-Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes,
- const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
+Status validate(const ITensorInfo *scores_in,
+ const ITensorInfo *boxes_in,
+ const ITensorInfo *batch_splits_in,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *boxes_out,
+ const ITensorInfo *classes,
+ const ITensorInfo *batch_splits_out,
+ const ITensorInfo *keeps,
+ const ITensorInfo *keeps_size,
+ const BoxNMSLimitInfo info)
{
ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
- const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
- if(is_qasymm8)
+ const bool is_qasymm8 =
+ scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out);
@@ -237,11 +276,11 @@ void CPPBoxWithNonMaximaSuppressionLimit::run()
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
dequantize_tensor(_scores_in, &_scores_in_f32);
dequantize_tensor(_boxes_in, &_boxes_in_f32);
- if(_batch_splits_in != nullptr)
+ if (_batch_splits_in != nullptr)
{
dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32);
}
@@ -249,16 +288,16 @@ void CPPBoxWithNonMaximaSuppressionLimit::run()
Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY);
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
quantize_tensor(&_scores_out_f32, _scores_out);
quantize_tensor(&_boxes_out_f32, _boxes_out);
quantize_tensor(&_classes_f32, _classes);
- if(_batch_splits_out != nullptr)
+ if (_batch_splits_out != nullptr)
{
quantize_tensor(&_batch_splits_out_f32, _batch_splits_out);
}
- if(_keeps != nullptr)
+ if (_keeps != nullptr)
{
quantize_tensor(&_keeps_f32, _keeps);
}
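
The dequantize_tensor()/quantize_tensor() helpers above convert QASYMM8, QASYMM8_SIGNED and QASYMM16 tensors to F32 before the box-with-NMS kernel runs and back afterwards. The following minimal sketch shows the asymmetric quantization round trip they are assumed to rely on (real = (q - offset) * scale, q = clamp(round(real / scale) + offset)); it is not taken from the library, and the scale/offset/sample values are invented.

// Minimal sketch of an asymmetric QASYMM8-style round trip (assumed scheme,
// not library code).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static float dequantize_u8(uint8_t q, float scale, int offset)
{
    return static_cast<float>(static_cast<int>(q) - offset) * scale;
}

static uint8_t quantize_u8(float v, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(v / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main()
{
    const float   scale  = 0.05f; // invented quantization parameters
    const int     offset = 10;
    const uint8_t q      = 137;

    const float real = dequantize_u8(q, scale, offset);
    std::printf("q=%u -> real=%.4f -> q=%u\n", static_cast<unsigned>(q), real,
                static_cast<unsigned>(quantize_u8(real, scale, offset)));
    return 0;
}
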
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 41d875eb97..e6291f973e 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -26,9 +26,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include <list>
@@ -36,25 +36,35 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
+ DetectionOutputLayerInfo info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The location input tensor should be [C2, N].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3,
+ "The priorbox input tensor should be [C3, 2, N].");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f && info.eta() > 1.f, "Eta should be between 0 and 1");
const int num_priors = input_priorbox->tensor_shape()[0] / 4;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) !=
+ input_loc->tensor_shape()[0],
+ "Number of priors must match number of location predictions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) !=
+ input_conf->tensor_shape()[0],
+ "Number of priors must match number of confidence predictions.");
// Validate configured output
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+ const unsigned int max_size =
+ info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
}
@@ -65,8 +75,7 @@ Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input
/** Function used to sort pair<float, T> in descend order based on the score (first) value.
*/
template <typename T>
-bool SortScorePairDescend(const std::pair<float, T> &pair1,
- const std::pair<float, T> &pair2)
+bool SortScorePairDescend(const std::pair<float, T> &pair1, const std::pair<float, T> &pair2)
{
return pair1.first > pair2.first;
}
@@ -82,16 +91,19 @@ bool SortScorePairDescend(const std::pair<float, T> &pair1,
* @param[out] all_location_predictions All the location predictions.
*
*/
-void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
- const int num_priors, const int num_loc_classes,
- const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+void retrieve_all_loc_predictions(const ITensor *input_loc,
+ const int num,
+ const int num_priors,
+ const int num_loc_classes,
+ const bool share_location,
+ std::vector<LabelBBox> &all_location_predictions)
{
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int c = 0; c < num_loc_classes; ++c)
+ for (int c = 0; c < num_loc_classes; ++c)
{
int label = share_location ? -1 : c;
- if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+ if (all_location_predictions[i].find(label) == all_location_predictions[i].end())
{
all_location_predictions[i][label].resize(num_priors);
}
@@ -102,19 +114,23 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
}
}
}
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int p = 0; p < num_priors; ++p)
+ for (int p = 0; p < num_priors; ++p)
{
- for(int c = 0; c < num_loc_classes; ++c)
+ for (int c = 0; c < num_loc_classes; ++c)
{
const int label = share_location ? -1 : c;
const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
//xmin, ymin, xmax, ymax
- all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
- all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
- all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
- all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+ all_location_predictions[i][label][p][0] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+ all_location_predictions[i][label][p][1] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+ all_location_predictions[i][label][p][2] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+ all_location_predictions[i][label][p][3] =
+ *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
}
}
}
@@ -130,26 +146,28 @@ void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
* @param[out] all_location_predictions All the location predictions.
*
*/
-void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
- const int num_priors, const int num_classes,
+void retrieve_all_conf_scores(const ITensor *input_conf,
+ const int num,
+ const int num_priors,
+ const int num_classes,
std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
{
std::vector<float> tmp_buffer;
tmp_buffer.resize(num * num_priors * num_classes);
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int c = 0; c < num_classes; ++c)
+ for (int c = 0; c < num_classes; ++c)
{
- for(int p = 0; p < num_priors; ++p)
+ for (int p = 0; p < num_priors; ++p)
{
- tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
- *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+ tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast<float *>(
+ input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
}
}
}
- for(int i = 0; i < num; ++i)
+ for (int i = 0; i < num; ++i)
{
- for(int c = 0; c < num_classes; ++c)
+ for (int c = 0; c < num_classes; ++c)
{
all_confidence_scores[i][c].resize(num_priors);
all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
@@ -168,28 +186,23 @@ void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
* @param[out] all_location_predictions All the location predictions.
*
*/
-void retrieve_all_priorbox(const ITensor *input_priorbox,
- const int num_priors,
- std::vector<BBox> &all_prior_bboxes,
+void retrieve_all_priorbox(const ITensor *input_priorbox,
+ const int num_priors,
+ std::vector<BBox> &all_prior_bboxes,
std::vector<std::array<float, 4>> &all_prior_variances)
{
- for(int i = 0; i < num_priors; ++i)
+ for (int i = 0; i < num_priors; ++i)
{
- all_prior_bboxes[i] =
- {
- {
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
- *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
- }
- };
+ all_prior_bboxes[i] = {{*reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+ *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}};
}
- std::array<float, 4> var({ { 0, 0, 0, 0 } });
- for(int i = 0; i < num_priors; ++i)
+ std::array<float, 4> var({{0, 0, 0, 0}});
+ for (int i = 0; i < num_priors; ++i)
{
- for(int j = 0; j < 4; ++j)
+ for (int j = 0; j < 4; ++j)
{
var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
}
@@ -208,13 +221,17 @@ void retrieve_all_priorbox(const ITensor *input_priorbox,
* @param[out] decode_bbox The decoded bboxes.
*
*/
-void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_variance,
- const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
- const bool clip_bbox, const BBox &bbox, BBox &decode_bbox)
+void DecodeBBox(const BBox &prior_bbox,
+ const std::array<float, 4> &prior_variance,
+ const DetectionOutputLayerCodeType code_type,
+ const bool variance_encoded_in_target,
+ const bool clip_bbox,
+ const BBox &bbox,
+ BBox &decode_bbox)
{
// if the variance is encoded in target, we simply need to add the offset predictions
// otherwise we need to scale the offset accordingly.
- switch(code_type)
+ switch (code_type)
{
case DetectionOutputLayerCodeType::CORNER:
{
@@ -237,10 +254,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
- const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
- const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
- const float decode_bbox_width = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
- const float decode_bbox_height = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+ const float decode_bbox_center_x =
+ (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+ const float decode_bbox_center_y =
+ (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+ const float decode_bbox_width =
+ (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+ const float decode_bbox_height =
+ (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
@@ -258,10 +279,14 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
- decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
- decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
- decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
- decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+ decode_bbox[0] =
+ prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+ decode_bbox[1] =
+ prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+ decode_bbox[2] =
+ prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+ decode_bbox[3] =
+ prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
break;
}
@@ -269,9 +294,9 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
}
- if(clip_bbox)
+ if (clip_bbox)
{
- for(auto &d_bbox : decode_bbox)
+ for (auto &d_bbox : decode_bbox)
{
d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
}
@@ -289,10 +314,13 @@ void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_varian
* @param[out] indices The kept indices of bboxes after nms.
*
*/
-void ApplyNMSFast(const std::vector<BBox> &bboxes,
- const std::vector<float> &scores, const float score_threshold,
- const float nms_threshold, const float eta, const int top_k,
- std::vector<int> &indices)
+void ApplyNMSFast(const std::vector<BBox> &bboxes,
+ const std::vector<float> &scores,
+ const float score_threshold,
+ const float nms_threshold,
+ const float eta,
+ const int top_k,
+ std::vector<int> &indices)
{
ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
@@ -300,9 +328,9 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
std::list<std::pair<float, int>> score_index_vec;
// Generate index score pairs.
- for(size_t i = 0; i < scores.size(); ++i)
+ for (size_t i = 0; i < scores.size(); ++i)
{
- if(scores[i] > score_threshold)
+ if (scores[i] > score_threshold)
{
score_index_vec.emplace_back(std::make_pair(scores[i], i));
}
@@ -313,7 +341,7 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
// Keep top_k scores if needed.
const int score_index_vec_size = score_index_vec.size();
- if(top_k > -1 && top_k < score_index_vec_size)
+ if (top_k > -1 && top_k < score_index_vec_size)
{
score_index_vec.resize(top_k);
}
@@ -322,46 +350,45 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
float adaptive_threshold = nms_threshold;
indices.clear();
- while(!score_index_vec.empty())
+ while (!score_index_vec.empty())
{
const int idx = score_index_vec.front().second;
bool keep = true;
- for(int kept_idx : indices)
+ for (int kept_idx : indices)
{
- if(keep)
+ if (keep)
{
// Compute the jaccard (intersection over union IoU) overlap between two bboxes.
- BBox intersect_bbox = std::array<float, 4>({ 0, 0, 0, 0 });
- if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+ BBox intersect_bbox = std::array<float, 4>({0, 0, 0, 0});
+ if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] ||
+ bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
{
- intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+ intersect_bbox = std::array<float, 4>({{0, 0, 0, 0}});
}
else
{
- intersect_bbox = std::array<float, 4>({ {
- std::max(bboxes[idx][0], bboxes[kept_idx][0]),
- std::max(bboxes[idx][1], bboxes[kept_idx][1]),
- std::min(bboxes[idx][2], bboxes[kept_idx][2]),
- std::min(bboxes[idx][3], bboxes[kept_idx][3])
- }
- });
+ intersect_bbox = std::array<float, 4>(
+ {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+ std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+ std::min(bboxes[idx][3], bboxes[kept_idx][3])}});
}
float intersect_width = intersect_bbox[2] - intersect_bbox[0];
float intersect_height = intersect_bbox[3] - intersect_bbox[1];
float overlap = 0.f;
- if(intersect_width > 0 && intersect_height > 0)
+ if (intersect_width > 0 && intersect_height > 0)
{
float intersect_size = intersect_width * intersect_height;
- float bbox1_size = (bboxes[idx][2] < bboxes[idx][0]
- || bboxes[idx][3] < bboxes[idx][1]) ?
- 0.f :
- (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
- float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
- || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
- 0.f :
- (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+ float bbox1_size = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1])
+ ? 0.f
+ : (bboxes[idx][2] - bboxes[idx][0]) *
+ (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+ float bbox2_size =
+ (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1])
+ ? 0.f
+ : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) *
+ (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
}
keep = (overlap <= adaptive_threshold);
@@ -371,12 +398,12 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
break;
}
}
- if(keep)
+ if (keep)
{
indices.push_back(idx);
}
score_index_vec.erase(score_index_vec.begin());
- if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+ if (keep && eta < 1.f && adaptive_threshold > 0.5f)
{
adaptive_threshold *= eta;
}
@@ -385,13 +412,27 @@ void ApplyNMSFast(const std::vector<BBox> &bboxes,
} // namespace
CPPDetectionOutputLayer::CPPDetectionOutputLayer()
- : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
- _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+ : _input_loc(nullptr),
+ _input_conf(nullptr),
+ _input_priorbox(nullptr),
+ _output(nullptr),
+ _info(),
+ _num_priors(),
+ _num(),
+ _all_location_predictions(),
+ _all_confidence_scores(),
+ _all_prior_bboxes(),
+ _all_prior_variances(),
+ _all_decode_bboxes(),
+ _all_indices()
{
}
-void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox,
- ITensor *output, DetectionOutputLayerInfo info)
+void CPPDetectionOutputLayer::configure(const ITensor *input_loc,
+ const ITensor *input_conf,
+ const ITensor *input_priorbox,
+ ITensor *output,
+ DetectionOutputLayerInfo info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info);
@@ -400,11 +441,13 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
// Since the number of bboxes to keep is unknown before nms, the shape is set to the maximum
// The maximum is keep_top_k * input_loc_size[1]
// Each row is a 7 dimension std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax]
- const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+ const unsigned int max_size =
+ info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
_input_loc = input_loc;
_input_conf = input_conf;
@@ -420,12 +463,12 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
_all_prior_variances.resize(_num_priors);
_all_decode_bboxes.resize(_num);
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- for(int c = 0; c < _info.num_loc_classes(); ++c)
+ for (int c = 0; c < _info.num_loc_classes(); ++c)
{
const int label = _info.share_location() ? -1 : c;
- if(label == _info.background_label_id())
+ if (label == _info.background_label_id())
{
// Ignore background class.
continue;
@@ -440,7 +483,11 @@ void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor
output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
}
-Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
+ DetectionOutputLayerInfo info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info));
return Status{};
@@ -449,7 +496,8 @@ Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITe
void CPPDetectionOutputLayer::run()
{
// Retrieve all location predictions.
- retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+ retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(),
+ _all_location_predictions);
// Retrieve all confidences.
retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
@@ -459,75 +507,79 @@ void CPPDetectionOutputLayer::run()
// Decode all loc predictions to bboxes
const bool clip_bbox = false;
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- for(int c = 0; c < _info.num_loc_classes(); ++c)
+ for (int c = 0; c < _info.num_loc_classes(); ++c)
{
const int label = _info.share_location() ? -1 : c;
- if(label == _info.background_label_id())
+ if (label == _info.background_label_id())
{
// Ignore background class.
continue;
}
- ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+ ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(),
+ "Could not find location predictions for label %d.", label);
const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
const int num_bboxes = _all_prior_bboxes.size();
ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
- for(int j = 0; j < num_bboxes; ++j)
+ for (int j = 0; j < num_bboxes; ++j)
{
- DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+ DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(),
+ _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j],
+ _all_decode_bboxes[i][label][j]);
}
}
}
int num_kept = 0;
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
- const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
std::map<int, std::vector<int>> indices;
- int num_det = 0;
- for(int c = 0; c < _info.num_classes(); ++c)
+ int num_det = 0;
+ for (int c = 0; c < _info.num_classes(); ++c)
{
- if(c == _info.background_label_id())
+ if (c == _info.background_label_id())
{
// Ignore background class
continue;
}
const int label = _info.share_location() ? -1 : c;
- if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+ if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
{
ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
}
const std::vector<float> &scores = conf_scores.find(c)->second;
- const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second;
+ const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second;
- ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+ ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(),
+ _info.top_k(), indices[c]);
num_det += indices[c].size();
}
int num_to_add = 0;
- if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+ if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
{
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
- for(auto const &it : indices)
+ for (auto const &it : indices)
{
const int label = it.first;
const std::vector<int> &label_indices = it.second;
- if(conf_scores.find(label) == conf_scores.end())
+ if (conf_scores.find(label) == conf_scores.end())
{
ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
}
const std::vector<float> &scores = conf_scores.find(label)->second;
- for(auto idx : label_indices)
+ for (auto idx : label_indices)
{
ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
@@ -541,7 +593,7 @@ void CPPDetectionOutputLayer::run()
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
- for(auto score_index_pair : score_index_pairs)
+ for (auto score_index_pair : score_index_pairs)
{
int label = score_index_pair.second.first;
int idx = score_index_pair.second.second;
@@ -562,25 +614,25 @@ void CPPDetectionOutputLayer::run()
_output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
int count = 0;
- for(int i = 0; i < _num; ++i)
+ for (int i = 0; i < _num; ++i)
{
- const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
- const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
- for(auto &it : _all_indices[i])
+ const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+ const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
+ for (auto &it : _all_indices[i])
{
const int label = it.first;
const std::vector<float> &scores = conf_scores.find(label)->second;
const int loc_label = _info.share_location() ? -1 : label;
- if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+ if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
{
// Either if there are no confidence predictions
// or there are no location predictions for current label.
ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label);
}
const std::vector<BBox> &bboxes = decode_bboxes.find(loc_label)->second;
- const std::vector<int> &indices = it.second;
+ const std::vector<int> &indices = it.second;
- for(auto idx : indices)
+ for (auto idx : indices)
{
*(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7)))) = i;
*(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
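
ApplyNMSFast() above keeps a candidate box only while its overlap with every already-kept box stays at or below the adaptive threshold. The intersection-over-union test it performs is reproduced below as a standalone sketch using the same [xmin, ymin, xmax, ymax] layout as BBox; the sample boxes are invented and the helper names (box_size, iou) are not from the library.

// Illustrative sketch, not from the patch: the IoU computation used inside
// ApplyNMSFast() above.
#include <algorithm>
#include <array>
#include <cstdio>

using Box = std::array<float, 4>; // xmin, ymin, xmax, ymax

// Area of a box, 0 if it is degenerate (mirrors the bbox size expressions above).
static float box_size(const Box &b)
{
    return (b[2] < b[0] || b[3] < b[1]) ? 0.f : (b[2] - b[0]) * (b[3] - b[1]);
}

// Jaccard overlap (intersection over union) between two boxes.
static float iou(const Box &a, const Box &b)
{
    const float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]);
    const float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]);
    if (iw <= 0.f || ih <= 0.f)
    {
        return 0.f; // disjoint boxes overlap nothing
    }
    const float inter = iw * ih;
    return inter / (box_size(a) + box_size(b) - inter);
}

int main()
{
    const Box a = {0.1f, 0.1f, 0.5f, 0.5f}; // invented sample boxes
    const Box b = {0.3f, 0.3f, 0.7f, 0.7f};
    // A candidate is suppressed when its IoU with a kept box exceeds the NMS threshold.
    std::printf("IoU = %.4f\n", iou(a, b));
    return 0;
}
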
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index ecbc49b3c1..2861d6cacb 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -26,9 +26,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include <cstddef>
#include <ios>
@@ -38,53 +38,76 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
- DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox)
+Status validate_arguments(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info,
+ const unsigned int kBatchSize,
+ const unsigned int kNumCoordBox)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
- if(input_box_encoding->num_dimensions() > 2)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3,
+ "The location input tensor shape should be [4, N, kBatchSize].");
+ if (input_box_encoding->num_dimensions() > 2)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(
+ input_box_encoding->dimension(2) != kBatchSize,
+ "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1),
- "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize].");
- if(input_anchors->num_dimensions() > 2)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox,
+ "The first dimension of the input box_encoding tensor should be equal to %d.",
+ kNumCoordBox);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ input_class_score->dimension(0) != (info.num_classes() + 1),
+ "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3,
+ "The anchors input tensor shape should be [4, N, kBatchSize].");
+ if (input_anchors->num_dimensions() > 2)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox,
+ "The first dimension of the input anchors tensor should be equal to %d.",
+ kNumCoordBox);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1))
- || (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) ||
+ (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
"The second dimension of the inputs should be the same.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M].");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1,
+ "The num_detection output tensor shape should be [M].");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f),
+ "The intersection over union should be positive and less than 1.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0,
+ "The number of max classes per detection should be positive.");
const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection();
// Validate configured outputs
- if(output_boxes->total_size() != 0)
+ if (output_boxes->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(),
+ TensorShape(4U, num_detected_boxes, 1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32);
}
- if(output_classes->total_size() != 0)
+ if (output_classes->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(),
+ TensorShape(num_detected_boxes, 1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32);
}
- if(output_scores->total_size() != 0)
+ if (output_scores->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(),
+ TensorShape(num_detected_boxes, 1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
}
- if(num_detection->total_size() != 0)
+ if (num_detection->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
@@ -93,15 +116,18 @@ Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorIn
return Status{};
}
-inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+inline void
+DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
{
const float half_factor = 0.5f;
// BBox is equivalent to CenterSizeEncoding [y,x,h,w]
const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
- const float half_h = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
- const float half_w = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+ const float half_h =
+ half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+ const float half_w =
+ half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
// Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
auto decoded_ptr = reinterpret_cast<float *>(decoded_it.ptr());
@@ -118,12 +144,15 @@ inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decode
* @param[in] info The detection information
* @param[out] decoded_boxes The decoded bboxes.
*/
-void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes)
+void DecodeCenterSizeBoxes(const ITensor *input_box_encoding,
+ const ITensor *input_anchors,
+ DetectionPostProcessLayerInfo info,
+ Tensor *decoded_boxes)
{
const QuantizationInfo &qi_box = input_box_encoding->info()->quantization_info();
const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info();
- BBox box_centersize{ {} };
- BBox anchor{ {} };
+ BBox box_centersize{{}};
+ BBox anchor{{}};
Window win;
win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape());
@@ -133,107 +162,155 @@ void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *inp
Iterator anchor_it(input_anchors, win);
Iterator decoded_it(decoded_boxes, win);
- if(input_box_encoding->info()->data_type() == DataType::QASYMM8)
+ if (input_box_encoding->info()->data_type() == DataType::QASYMM8)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
- const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
- box_centersize = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
- dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)
- });
- anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
- dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)
- });
- DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
- },
- box_it, anchor_it, decoded_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
+ box_centersize =
+ BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
+ dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)});
+ anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors),
+ dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
+ dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors),
+ dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)});
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
}
- else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+ else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
- const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
- box_centersize = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
- dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)
- });
- anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
- dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)
- });
- DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
- },
- box_it, anchor_it, decoded_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
+ box_centersize = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box),
+ dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
+ dequantize_qasymm8_signed(*(2 + box_ptr), qi_box),
+ dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)});
+ anchor = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors),
+ dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
+ dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors),
+ dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)});
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
}
else
{
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr());
- const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
- box_centersize = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) });
- anchor = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) });
- DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
- },
- box_it, anchor_it, decoded_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto box_ptr = reinterpret_cast<const float *>(box_it.ptr());
+ const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
+ box_centersize = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)});
+ anchor = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)});
+ DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+ },
+ box_it, anchor_it, decoded_it);
}
}
-void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms,
- std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores,
- ITensor *num_detection)
+void SaveOutputs(const Tensor *decoded_boxes,
+ const std::vector<int> &result_idx_boxes_after_nms,
+ const std::vector<float> &result_scores_after_nms,
+ const std::vector<int> &result_classes_after_nms,
+ std::vector<unsigned int> &sorted_indices,
+ const unsigned int num_output,
+ const unsigned int max_detections,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection)
{
// xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax
unsigned int i = 0;
- for(; i < num_output; ++i)
+ for (; i < num_output; ++i)
{
const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]];
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
- *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
- *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
- *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = result_scores_after_nms[sorted_indices[i]];
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
+ *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) =
+ *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
+ *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) =
+ static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
+ *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) =
+ result_scores_after_nms[sorted_indices[i]];
}
- for(; i < max_detections; ++i)
+ for (; i < max_detections; ++i)
{
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f;
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f;
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f;
*(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f;
- *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f;
- *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f;
+ *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f;
+ *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) = 0.0f;
}
*(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output;
}
} // namespace
CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr),
- _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(),
- _selected_indices(), _class_scores(), _input_scores_to_use(nullptr)
+ : _memory_group(std::move(memory_manager)),
+ _nms(),
+ _input_box_encoding(nullptr),
+ _input_scores(nullptr),
+ _input_anchors(nullptr),
+ _output_boxes(nullptr),
+ _output_classes(nullptr),
+ _output_scores(nullptr),
+ _num_detection(nullptr),
+ _info(),
+ _num_boxes(),
+ _num_classes_with_background(),
+ _num_max_detected_boxes(),
+ _dequantize_scores(false),
+ _decoded_boxes(),
+ _decoded_scores(),
+ _selected_indices(),
+ _class_scores(),
+ _input_scores_to_use(nullptr)
{
}
-void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores,
- const ITensor *input_anchors, ITensor *output_boxes, ITensor *output_classes,
- ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+ const ITensor *input_scores,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+ output_scores);
ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
num_detection, info);
_num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection();
- auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+ auto_init_if_empty(*output_boxes->info(),
+ TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+ auto_init_if_empty(*output_classes->info(),
+ TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+ auto_init_if_empty(*output_scores->info(),
+ TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(),
- num_detection->info(),
- info, _kBatchSize, _kNumCoordBox));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+ output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox));
_input_box_encoding = input_box_encoding;
_input_scores = input_scores;
@@ -245,13 +322,24 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_info = info;
_num_boxes = input_box_encoding->info()->dimension(1);
_num_classes_with_background = _input_scores->info()->dimension(0);
- _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
-
- auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32));
- auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32));
+ _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
+
+ auto_init_if_empty(*_decoded_boxes.info(),
+ TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1,
+ DataType::F32));
+ auto_init_if_empty(
+ *_decoded_scores.info(),
+ TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize),
+ 1, DataType::F32));
+ auto_init_if_empty(
+ *_selected_indices.info(),
+ TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1,
+ DataType::S32));
const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes());
- auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32));
+ auto_init_if_empty(
+ *_class_scores.info(),
+ TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1,
+ DataType::F32));
_input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores;
@@ -260,7 +348,9 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_memory_group.manage(&_decoded_scores);
_memory_group.manage(&_selected_indices);
_memory_group.manage(&_class_scores);
- _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold());
+ _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices,
+ info.use_regular_nms() ? info.detection_per_class() : info.max_detections(),
+ info.nms_score_threshold(), info.iou_threshold());
// Allocate and reserve intermediate tensors and vectors
_decoded_boxes.allocator()->allocate();
@@ -269,18 +359,28 @@ void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
_class_scores.allocator()->allocate();
}
-Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- constexpr unsigned int kBatchSize = 1;
- constexpr unsigned int kNumCoordBox = 4;
- const TensorInfo _decoded_boxes_info = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
- const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
- const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
-
- ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(),
- info.iou_threshold()));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox));
+ constexpr unsigned int kBatchSize = 1;
+ constexpr unsigned int kNumCoordBox = 4;
+ const TensorInfo _decoded_boxes_info =
+ TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
+ const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
+ const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info,
+ &_selected_indices_info, info.max_detections(),
+ info.nms_score_threshold(), info.iou_threshold()));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes,
+ output_classes, output_scores, num_detection, info, kBatchSize,
+ kNumCoordBox));
return Status{};
}
@@ -293,62 +393,69 @@ void CPPDetectionPostProcessLayer::run()
DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes);
// Decode scores if necessary
- if(_dequantize_scores)
+ if (_dequantize_scores)
{
- if(_input_box_encoding->info()->data_type() == DataType::QASYMM8)
+ if (_input_box_encoding->info()->data_type() == DataType::QASYMM8)
{
- for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+ for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
{
- for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
{
*(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
- dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ dequantize_qasymm8(
+ *(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))),
+ _input_scores->info()->quantization_info());
}
}
}
- else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+ else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
{
- for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+ for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
{
- for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+ for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
{
*(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
- dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+ dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(
+ _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))),
+ _input_scores->info()->quantization_info());
}
}
}
}
// Regular NMS
- if(_info.use_regular_nms())
+ if (_info.use_regular_nms())
{
std::vector<int> result_idx_boxes_after_nms;
std::vector<int> result_classes_after_nms;
std::vector<float> result_scores_after_nms;
std::vector<unsigned int> sorted_indices;
- for(unsigned int c = 0; c < num_classes; ++c)
+ for (unsigned int c = 0; c < num_classes; ++c)
{
// For each boxes get scores of the boxes for the class c
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
*(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) =
- *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
+ *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(
+ Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
}
// Run Non-maxima Suppression
_nms.run();
- for(unsigned int i = 0; i < _info.detection_per_class(); ++i)
+ for (unsigned int i = 0; i < _info.detection_per_class(); ++i)
{
- const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
- if(selected_index == -1)
+ const auto selected_index =
+ *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
+ if (selected_index == -1)
{
// Nms will return -1 for all the last M-elements not valid
break;
}
result_idx_boxes_after_nms.emplace_back(selected_index);
- result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
+ result_scores_after_nms.emplace_back(
+ (reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
result_classes_after_nms.emplace_back(c);
}
}
@@ -360,49 +467,46 @@ void CPPDetectionPostProcessLayer::run()
// Sort selected indices based on result scores
sorted_indices.resize(num_selected);
std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
- std::partial_sort(sorted_indices.data(),
- sorted_indices.data() + num_output,
+ std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output,
sorted_indices.data() + num_selected,
[&](unsigned int first, unsigned int second)
- {
-
- return result_scores_after_nms[first] > result_scores_after_nms[second];
- });
+ { return result_scores_after_nms[first] > result_scores_after_nms[second]; });
- SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices,
- num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+ SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms,
+ sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores,
+ _num_detection);
}
// Fast NMS
else
{
- const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
+ const unsigned int num_classes_per_box =
+ std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
std::vector<float> max_scores;
std::vector<int> box_indices;
std::vector<int> max_score_classes;
- for(unsigned int b = 0; b < _num_boxes; ++b)
+ for (unsigned int b = 0; b < _num_boxes; ++b)
{
std::vector<float> box_scores;
- for(unsigned int c = 0; c < num_classes; ++c)
+ for (unsigned int c = 0; c < num_classes; ++c)
{
- box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
+ box_scores.emplace_back(
+ *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
}
std::vector<unsigned int> max_score_indices;
max_score_indices.resize(_info.num_classes());
std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0);
- std::partial_sort(max_score_indices.data(),
- max_score_indices.data() + num_classes_per_box,
+ std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box,
max_score_indices.data() + num_classes,
[&](unsigned int first, unsigned int second)
- {
- return box_scores[first] > box_scores[second];
- });
+ { return box_scores[first] > box_scores[second]; });
- for(unsigned int i = 0; i < num_classes_per_box; ++i)
+ for (unsigned int i = 0; i < num_classes_per_box; ++i)
{
- const float score_to_add = box_scores[max_score_indices[i]];
- *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add;
+ const float score_to_add = box_scores[max_score_indices[i]];
+ *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) =
+ score_to_add;
max_scores.emplace_back(score_to_add);
box_indices.emplace_back(b);
max_score_classes.emplace_back(max_score_indices[i]);
@@ -412,10 +516,10 @@ void CPPDetectionPostProcessLayer::run()
// Run Non-maxima Suppression
_nms.run();
std::vector<unsigned int> selected_indices;
- for(unsigned int i = 0; i < max_detections; ++i)
+ for (unsigned int i = 0; i < max_detections; ++i)
{
// NMS returns M valid indices, the not valid tail is filled with -1
- if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
+ if (*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
{
// Nms will return -1 for all the last M-elements not valid
break;
@@ -425,8 +529,8 @@ void CPPDetectionPostProcessLayer::run()
// We select the max detection numbers of the highest score of all classes
const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size());
- SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices,
- num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+ SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output,
+ max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
}
}
} // namespace arm_compute
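
The regular-NMS branch reformatted above selects its final detections with a small top-k idiom: an index vector is filled with std::iota and the highest-scoring num_output entries are brought to the front with std::partial_sort using a score-comparing lambda. For readers tracing the re-wrapped lambda, here is a minimal standalone sketch of that idiom in plain standard C++; the function name top_k_indices and the sample scores are illustrative only and not part of the library.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Return the indices of the num_output highest scores, best first,
// mirroring the std::iota + std::partial_sort pattern used above.
std::vector<unsigned int> top_k_indices(const std::vector<float> &scores, unsigned int num_output)
{
    std::vector<unsigned int> indices(scores.size());
    std::iota(indices.begin(), indices.end(), 0U);

    num_output = std::min<unsigned int>(num_output, indices.size());
    std::partial_sort(indices.begin(), indices.begin() + num_output, indices.end(),
                      [&scores](unsigned int first, unsigned int second) { return scores[first] > scores[second]; });

    indices.resize(num_output);
    return indices;
}

int main()
{
    const std::vector<float> scores{0.1f, 0.9f, 0.4f, 0.7f};
    for (unsigned int idx : top_k_indices(scores, 2))
    {
        std::printf("%u\n", idx); // prints 1, then 3
    }
    return 0;
}
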
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
index 6d01b127c0..3217742c6b 100644
--- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -29,9 +29,12 @@
namespace arm_compute
{
-void CPPNonMaximumSuppression::configure(
- const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold)
+void CPPNonMaximumSuppression::configure(const ITensor *bboxes,
+ const ITensor *scores,
+ ITensor *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold)
{
ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
@@ -40,10 +43,14 @@ void CPPNonMaximumSuppression::configure(
_kernel = std::move(k);
}
-Status CPPNonMaximumSuppression::validate(
- const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold)
+Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold)
{
- return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+ return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold,
+ nms_threshold);
}
} // namespace arm_compute
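
The CPPNonMaximumSuppression change above is signature re-wrapping only, but it shows the two-phase idiom shared by these functions: a static, side-effect-free validate() that forwards to the kernel's validate(), and a configure() that builds the kernel run() will later execute. The sketch below reproduces that shape with hypothetical DoubleKernel/DoubleFunction types invented for illustration; it is not ComputeLibrary code.

#include <cstdio>
#include <memory>

struct Status
{
    bool ok{true};
};

// Hypothetical kernel: static validate() checks arguments, configure() stores them, run() executes.
class DoubleKernel
{
public:
    static Status validate(const int *src, const int *dst)
    {
        return Status{src != nullptr && dst != nullptr};
    }
    void configure(const int *src, int *dst)
    {
        _src = src;
        _dst = dst;
    }
    void run()
    {
        *_dst = *_src * 2;
    }

private:
    const int *_src{nullptr};
    int       *_dst{nullptr};
};

// Hypothetical function wrapper: validate() forwards to the kernel, configure() owns the kernel.
class DoubleFunction
{
public:
    static Status validate(const int *src, const int *dst)
    {
        return DoubleKernel::validate(src, dst);
    }
    void configure(const int *src, int *dst)
    {
        auto k = std::make_unique<DoubleKernel>();
        k->configure(src, dst);
        _kernel = std::move(k);
    }
    void run()
    {
        _kernel->run();
    }

private:
    std::unique_ptr<DoubleKernel> _kernel{nullptr};
};

int main()
{
    int in = 21, out = 0;
    if (DoubleFunction::validate(&in, &out).ok)
    {
        DoubleFunction f;
        f.configure(&in, &out);
        f.run();
    }
    std::printf("%d\n", out); // 42
    return 0;
}
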
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index 62a74735a2..3d64def804 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -38,7 +38,10 @@ void CPPTopKV::configure(const ITensor *predictions, const ITensor *targets, ITe
_kernel = std::move(kernel);
}
-Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKV::validate(const ITensorInfo *predictions,
+ const ITensorInfo *targets,
+ ITensorInfo *output,
+ const unsigned int k)
{
return CPPTopKVKernel::validate(predictions, targets, output, k);
}
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 436fd9ca16..ecf84abd2c 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Log.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuInfo.h"
#include "src/runtime/SchedulerUtils.h"
@@ -59,7 +60,7 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
#ifndef BARE_METAL
const Window &max_window = window;
- if(hints.split_dimension() == IScheduler::split_dimensions_all)
+ if (hints.split_dimension() == IScheduler::split_dimensions_all)
{
/*
* if the split dim is size_t max then this signals we should parallelise over
@@ -73,27 +74,27 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n);
std::vector<IScheduler::Workload> workloads;
- for(unsigned int ni = 0; ni != n_threads; ++ni)
+ for (unsigned int ni = 0; ni != n_threads; ++ni)
{
- for(unsigned int mi = 0; mi != m_threads; ++mi)
+ for (unsigned int mi = 0; mi != m_threads; ++mi)
{
workloads.push_back(
- [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
- {
- //narrow the window to our mi-ni workload
- Window win = max_window.split_window(Window::DimX, mi, m_threads)
- .split_window(Window::DimY, ni, n_threads);
+ [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info)
+ {
+ //narrow the window to our mi-ni workload
+ Window win = max_window.split_window(Window::DimX, mi, m_threads)
+ .split_window(Window::DimY, ni, n_threads);
- win.validate();
+ win.validate();
- Window thread_locator;
- thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
- thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+ Window thread_locator;
+ thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+ thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
- thread_locator.validate();
+ thread_locator.validate();
- kernel->run_nd(win, info, thread_locator);
- });
+ kernel->run_nd(win, info, thread_locator);
+ });
}
}
run_workloads(workloads);
@@ -103,16 +104,16 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
const unsigned int num_threads = std::min(num_iterations, this->num_threads());
- if(num_iterations == 0)
+ if (num_iterations == 0)
{
return;
}
- if(!kernel->is_parallelisable() || num_threads == 1)
+ if (!kernel->is_parallelisable() || num_threads == 1)
{
ThreadInfo info;
info.cpu_info = &cpu_info();
- if(tensors.empty())
+ if (tensors.empty())
{
kernel->run(max_window, info);
}
@@ -124,14 +125,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
else
{
unsigned int num_windows = 0;
- switch(hints.strategy())
+ switch (hints.strategy())
{
case StrategyHint::STATIC:
num_windows = num_threads;
break;
case StrategyHint::DYNAMIC:
{
- const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+ const unsigned int granule_threshold =
+ (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
// Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
break;
@@ -143,15 +145,15 @@ void IScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const W
num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
std::vector<IScheduler::Workload> workloads(num_windows);
- for(unsigned int t = 0; t < num_windows; ++t)
+ for (unsigned int t = 0; t < num_windows; ++t)
{
//Capture 't' by copy, all the other variables by reference:
- workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+ workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
{
Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
win.validate();
- if(tensors.empty())
+ if (tensors.empty())
{
kernel->run(win, info);
}
@@ -175,36 +177,43 @@ void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const ch
run_workloads(workloads);
}
-std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info)
+std::size_t IScheduler::adjust_num_of_windows(const Window &window,
+ std::size_t split_dimension,
+ std::size_t init_num_windows,
+ const ICPPKernel &kernel,
+ const CPUInfo &cpu_info)
{
// Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
- if(window.num_iterations(split_dimension) < init_num_windows)
+ if (window.num_iterations(split_dimension) < init_num_windows)
{
auto recommended_split_dim = Window::DimX;
- for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
+ for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
{
- if(window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
+ if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
{
recommended_split_dim = dims;
}
}
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", split_dimension,
- recommended_split_dim);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim",
+ split_dimension, recommended_split_dim);
}
- for(auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first
+ for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows ,init_num_windows, first
{
// Try splitting the workload into t, subject to each subworkload size <= mws.
- if((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
+ if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
{
- if(t != init_num_windows)
+ if (t != init_num_windows)
{
- ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using a different thread count than the one assigned by the user.");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE(
+ "The scheduler is using a different thread count than the one assigned by the user.");
}
return t;
}
}
- ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using single thread instead of the thread count assigned by the user.");
+ ARM_COMPUTE_LOG_INFO_MSG_CORE(
+ "The scheduler is using single thread instead of the thread count assigned by the user.");
return 1; // If the workload is so small that it can't be split, we should run a single thread
}
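
The IScheduler hunk above is indentation-only, but the lambda it re-wraps implements the 2D split: the maximal window is divided into an m_threads x n_threads grid and one workload is queued per (mi, ni) tile. The following self-contained sketch shows the same splitting idea using plain integer ranges and std::thread instead of ComputeLibrary's Window and workload queue; all names and signatures here are assumptions made for illustration.

#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

// Split [0, m) x [0, n) into an m_threads x n_threads grid of tiles and run one
// worker per tile, mirroring the nested ni/mi loops in schedule_common() above.
void run_2d_tiles(unsigned int m,
                  unsigned int n,
                  unsigned int m_threads,
                  unsigned int n_threads,
                  const std::function<void(unsigned int, unsigned int, unsigned int, unsigned int)> &tile_fn)
{
    std::vector<std::thread> workers;
    for (unsigned int ni = 0; ni != n_threads; ++ni)
    {
        for (unsigned int mi = 0; mi != m_threads; ++mi)
        {
            // Each tile receives a contiguous sub-range in both dimensions.
            const unsigned int m_start = mi * m / m_threads;
            const unsigned int m_end   = (mi + 1) * m / m_threads;
            const unsigned int n_start = ni * n / n_threads;
            const unsigned int n_end   = (ni + 1) * n / n_threads;
            workers.emplace_back([=] { tile_fn(m_start, m_end, n_start, n_end); });
        }
    }
    for (auto &w : workers)
    {
        w.join();
    }
}

int main()
{
    run_2d_tiles(8, 8, 2, 2,
                 [](unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1)
                 { std::printf("tile rows [%u,%u) cols [%u,%u)\n", m0, m1, n0, n1); });
    return 0;
}
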
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index a6bc950644..8e5b62ae7d 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -43,7 +43,7 @@ ISimpleLifetimeManager::ISimpleLifetimeManager()
void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
{
- if(_active_group == nullptr)
+ if (_active_group == nullptr)
{
ARM_COMPUTE_ERROR_ON(group == nullptr);
_active_group = group;
@@ -52,12 +52,12 @@ void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
{
- if(group == nullptr)
+ if (group == nullptr)
{
return false;
}
const bool status = bool(_finalized_groups.erase(group));
- if(status)
+ if (status)
{
group->mappings().clear();
}
@@ -67,12 +67,13 @@ bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
void ISimpleLifetimeManager::start_lifetime(void *obj)
{
ARM_COMPUTE_ERROR_ON(obj == nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+ ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements),
+ "Memory object is already registered!");
// Check if there is a free blob
- if(_free_blobs.empty())
+ if (_free_blobs.empty())
{
- _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
+ _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}});
}
else
{
@@ -100,10 +101,8 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t
el.status = true;
// Find object in the occupied lists
- auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
- {
- return obj == b.id;
- });
+ auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs),
+ [&obj](const Blob &b) { return obj == b.id; });
ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs));
// Update occupied blob and return as free
@@ -114,7 +113,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t
_free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
// Check if all objects are finalized and reset active group
- if(are_all_finalized())
+ if (are_all_finalized())
{
ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty());
@@ -133,9 +132,7 @@ void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t
bool ISimpleLifetimeManager::are_all_finalized() const
{
- return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e)
- {
- return !e.second.status;
- });
+ return !std::any_of(std::begin(_active_elements), std::end(_active_elements),
+ [](const std::pair<void *, Element> &e) { return !e.second.status; });
}
} // namespace arm_compute
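
The end_lifetime() hunk above keeps the manager's two-list bookkeeping: the blob tracking a finished object is located with std::find_if and then moved from the occupied list to the free list via std::list::splice, which relinks the node instead of copying it. A minimal sketch of that idiom follows, using a simplified Blob struct rather than the library's full definition.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <list>

// Simplified stand-in for the library's Blob; only the id field matters here.
struct Blob
{
    void       *id{nullptr};
    std::size_t max_size{0};
};

int main()
{
    int a = 0, b = 0;
    std::list<Blob> occupied{{&a, 128}, {&b, 256}};
    std::list<Blob> free_list;

    void *obj = &b;
    // Find the blob that tracks 'obj' ...
    auto it = std::find_if(occupied.begin(), occupied.end(), [&obj](const Blob &blob) { return blob.id == obj; });
    if (it != occupied.end())
    {
        // ... and move the node into the free list without copying it.
        free_list.splice(free_list.begin(), occupied, it);
    }

    std::printf("occupied: %zu, free: %zu\n", occupied.size(), free_list.size()); // occupied: 1, free: 1
    return 0;
}
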
diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp
index 373c50c73d..96287dcc49 100644
--- a/src/runtime/IWeightsManager.cpp
+++ b/src/runtime/IWeightsManager.cpp
@@ -25,14 +25,13 @@
namespace arm_compute
{
-IWeightsManager::IWeightsManager()
- : _managed_weights(), _managed_counter(), _managed_weights_parents()
+IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents()
{
}
void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent)
{
- if(!are_weights_managed(weights))
+ if (!are_weights_managed(weights))
{
_managed_weights[weights];
_managed_counter[weights];
@@ -44,9 +43,9 @@ void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent)
// In case the weights are an output of a previous reshape function
// store the parent's link
- if(parent != nullptr)
+ if (parent != nullptr)
{
- if(_managed_weights_parents.find(weights) == _managed_weights_parents.end())
+ if (_managed_weights_parents.find(weights) == _managed_weights_parents.end())
{
_managed_weights_parents[weights] = parent;
}
@@ -59,13 +58,13 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
// Find if I have the same weights with weights transform. If I do, don't run the reshape
auto item = _managed_weights.find(weights);
- bool perform_run{ true };
- ITensor *weights_tensor{ nullptr };
+ bool perform_run{true};
+ ITensor *weights_tensor{nullptr};
// Check if I already have the requested transform and I have run the reshape function
- for(auto it : item->second)
+ for (auto it : item->second)
{
- if(it->is_reshape_run() && (it->uid() == weights_transform->uid()))
+ if (it->is_reshape_run() && (it->uid() == weights_transform->uid()))
{
weights_tensor = it->get_weights();
perform_run = false;
@@ -73,7 +72,7 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
}
}
- if(perform_run)
+ if (perform_run)
{
weights_transform->run();
weights_tensor = weights_transform->get_weights();
@@ -81,10 +80,10 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
// Check if we can release memory from parent
auto parent_item = _managed_weights_parents.find(weights);
- if(parent_item != _managed_weights_parents.end())
+ if (parent_item != _managed_weights_parents.end())
{
int32_t refcount = parent_item->second->decrease_refcount();
- if(refcount == 0)
+ if (refcount == 0)
{
parent_item->second->release();
}
@@ -92,20 +91,20 @@ ITensor *IWeightsManager::run(const ITensor *weights, ITransformWeights *weights
// Check top level weights. If all the transformations are done
// mark the weights as unused
- if(_managed_weights_parents.find(weights) == _managed_weights_parents.end())
+ if (_managed_weights_parents.find(weights) == _managed_weights_parents.end())
{
auto item = _managed_weights.find(weights);
bool mark_as_unused = true;
- for(auto it : item->second)
+ for (auto it : item->second)
{
- if(!it->is_reshape_run())
+ if (!it->is_reshape_run())
{
mark_as_unused = false;
break;
}
}
- if(mark_as_unused)
+ if (mark_as_unused)
{
weights->mark_as_unused();
}
@@ -123,15 +122,15 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei
{
ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed");
- ITensor *transformed_weights{ nullptr };
+ ITensor *transformed_weights{nullptr};
auto item = _managed_weights.find(weights);
// Check if I already have the requested transform. If I do,
// increase the refcount of the transformed weights object and
// reuse the tensor
- for(auto it : item->second)
+ for (auto it : item->second)
{
- if(it->uid() == weights_transform->uid())
+ if (it->uid() == weights_transform->uid())
{
transformed_weights = it->get_weights();
it->increase_refcount();
@@ -139,7 +138,7 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei
}
}
- if(transformed_weights == nullptr)
+ if (transformed_weights == nullptr)
{
transformed_weights = weights_transform->get_weights();
weights_transform->increase_refcount();
@@ -154,13 +153,13 @@ ITensor *IWeightsManager::acquire(const ITensor *weights, ITransformWeights *wei
void IWeightsManager::release(const ITensor *weights)
{
- if(weights == nullptr || !are_weights_managed(weights))
+ if (weights == nullptr || !are_weights_managed(weights))
{
return;
}
_managed_counter[weights].counter--;
- if(_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused)
+ if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused)
{
weights->mark_as_unused();
}
@@ -168,7 +167,7 @@ void IWeightsManager::release(const ITensor *weights)
void IWeightsManager::pre_mark_as_unused(const ITensor *weights)
{
- if(weights == nullptr || !are_weights_managed(weights))
+ if (weights == nullptr || !are_weights_managed(weights))
{
return;
}
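
IWeightsManager::run() and acquire() above (re-braced only) reuse an already transformed weights tensor whenever a transform with the same uid has run before, instead of executing the reshape again. The sketch below shows that reuse-by-key caching idea in isolation; TransformCache and TransformResult are hypothetical stand-ins invented for illustration, not ComputeLibrary classes.

#include <cstdio>
#include <functional>
#include <unordered_map>

struct TransformResult
{
    int data{0};
};

class TransformCache
{
public:
    // Return the cached result for 'uid', running 'transform' only on a cache miss.
    TransformResult &run(int uid, const std::function<TransformResult()> &transform)
    {
        auto it = _results.find(uid);
        if (it != _results.end())
        {
            return it->second; // a transform with this uid already ran: reuse its output
        }
        return _results.emplace(uid, transform()).first->second;
    }

private:
    std::unordered_map<int, TransformResult> _results{};
};

int main()
{
    TransformCache cache;
    int            runs    = 0;
    auto           reshape = [&runs]() { ++runs; return TransformResult{42}; };

    cache.run(/* uid */ 7, reshape);
    cache.run(/* uid */ 7, reshape); // second call hits the cache; reshape runs once

    std::printf("runs = %d\n", runs); // runs = 1
    return 0;
}
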
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index ac0a32539e..90fd025eb7 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -27,20 +27,17 @@
namespace arm_compute
{
-Memory::Memory()
- : _region(nullptr), _region_owned(nullptr)
+Memory::Memory() : _region(nullptr), _region_owned(nullptr)
{
}
-Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory)
- : _region(nullptr), _region_owned(memory)
+Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
{
_region_owned = memory;
_region = _region_owned.get();
}
-Memory::Memory(IMemoryRegion *memory)
- : _region(memory), _region_owned(nullptr)
+Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
{
_region = memory;
}
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index 2e418ae9e3..5fa9ea47e9 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -31,7 +31,8 @@
namespace arm_compute
{
-MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager,
+ std::shared_ptr<IPoolManager> pool_manager)
: _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager))
{
ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
@@ -57,7 +58,7 @@ void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t
// Create pools
auto pool_template = _lifetime_mgr->create_pool(&allocator);
- for(int i = num_pools; i > 1; --i)
+ for (int i = num_pools; i > 1; --i)
{
auto pool = pool_template->duplicate();
_pool_mgr->register_pool(std::move(pool));
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
index a5fc0a2726..fcfd3251ff 100644
--- a/src/runtime/NEON/INEOperator.cpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/INEOperator.h"
+
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -32,14 +34,13 @@ namespace experimental
{
INEOperator::~INEOperator() = default;
-INEOperator::INEOperator(IRuntimeContext *ctx)
- : _kernel(), _ctx(ctx), _workspace()
+INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
{
}
void INEOperator::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 5438bce62a..b6977221b9 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
@@ -33,8 +34,7 @@ namespace arm_compute
INESimpleFunction::~INESimpleFunction() = default;
INESimpleFunction::INESimpleFunction() // NOLINT
- : _kernel(),
- _border_handler()
+ : _kernel(), _border_handler()
{
}
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index 21dd58e378..04bff9fa4b 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/INEKernel.h"
#include "src/runtime/Utils.h"
@@ -32,9 +33,7 @@ namespace arm_compute
{
INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default;
-INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx)
- : _kernel(),
- _ctx(ctx)
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx)
{
}
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index e48aede590..59199452ce 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -24,24 +24,24 @@
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuActivation.h"
namespace arm_compute
{
struct NEActivationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- IRuntimeContext *ctx{ nullptr };
- std::unique_ptr<cpu::CpuActivation> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ IRuntimeContext *ctx{nullptr};
+ std::unique_ptr<cpu::CpuActivation> op{nullptr};
};
-NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
+NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default;
NEActivationLayer::~NEActivationLayer() = default;
@@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay
_impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info);
}
-Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
return cpu::CpuActivation::validate(input, output, act_info);
}
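
NEActivationLayer above shows the pImpl layout the NE* runtime functions share: a nested Impl struct whose members carry brace-or-equal default initialisers such as {nullptr}, owned through a std::unique_ptr, populated in configure() and consumed in run(). A stripped-down sketch of that structure follows; the ReluFunction name and the toy operation are illustrative, not the library's.

#include <cstdio>
#include <memory>

class ReluFunction
{
public:
    ReluFunction() : _impl(std::make_unique<Impl>())
    {
    }
    ReluFunction(ReluFunction &&)            = default;
    ReluFunction &operator=(ReluFunction &&) = default;
    ~ReluFunction()                          = default;

    void configure(const float *src, float *dst)
    {
        _impl->src = src;
        _impl->dst = dst;
    }

    void run()
    {
        *_impl->dst = (*_impl->src > 0.0f) ? *_impl->src : 0.0f;
    }

private:
    // Hidden state: default member initialisers keep the struct well-defined
    // before configure() is called, matching the {nullptr} style above.
    struct Impl
    {
        const float *src{nullptr};
        float       *dst{nullptr};
    };
    std::unique_ptr<Impl> _impl;
};

int main()
{
    float in = -1.5f, out = 99.0f;
    ReluFunction relu;
    relu.configure(&in, &out);
    relu.run();
    std::printf("%f\n", out); // 0.000000
    return 0;
}
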
diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp
index cfeaefc4fd..a72364791c 100644
--- a/src/runtime/NEON/functions/NEAddMulAdd.cpp
+++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuAddMulAdd.h"
@@ -33,45 +34,50 @@ namespace arm_compute
{
struct NEAddMulAdd::Impl
{
- std::unique_ptr<cpu::CpuAddMulAdd> op{ nullptr };
+ std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr};
WorkspaceData<Tensor> workspace_tensors{};
ITensorPack run_pack{};
MemoryGroup memory_group{};
};
-NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
}
NEAddMulAdd::~NEAddMulAdd() = default;
-void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, ITensor *add_output,
- ITensor *final_output, const ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEAddMulAdd::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *bn_mul,
+ ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ const ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
- _impl->op = std::make_unique<cpu::CpuAddMulAdd>();
- _impl->op->configure(input1->info(), input2->info(), bn_mul->info(),
- bn_add->info(), add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info);
+ _impl->op = std::make_unique<cpu::CpuAddMulAdd>();
+ _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(),
+ add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, input1 },
- { TensorType::ACL_SRC_1, input2 },
- { TensorType::ACL_SRC_2, bn_mul },
- { TensorType::ACL_SRC_3, bn_add },
- { TensorType::ACL_DST_0, add_output },
- { TensorType::ACL_DST_1, final_output },
+ _impl->run_pack = {
+ {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul},
+ {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output},
};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *bn_mul,
- const ITensorInfo *bn_add, const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEAddMulAdd::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
}
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 3ac127b02e..fbaf1a96e7 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -32,6 +32,7 @@
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -48,8 +49,7 @@ struct NEArgMinMaxLayer::Impl
NEArgMinMaxLayer::~NEArgMinMaxLayer() = default;
-NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_manager = std::move(memory_manager);
}
@@ -58,7 +58,8 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons
{
ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
_impl->reduction_function = std::make_unique<NEReductionOperation>();
- if(output->info() && (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
+ if (output->info() &&
+ (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
{
_impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
_impl->cast_function = std::make_unique<NECast>();
@@ -74,9 +75,11 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons
}
}
-Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid operation");
return NEReductionOperation::validate(input, output, axis, op, false);
}
@@ -84,7 +87,7 @@ void NEArgMinMaxLayer::run()
{
MemoryGroupResourceScope scope_mg(_impl->memory_group);
_impl->reduction_function->run();
- if(_impl->tmp_reduction_result != nullptr)
+ if (_impl->tmp_reduction_result != nullptr)
{
_impl->cast_function->run();
}
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index a7581ca9f4..aff16ae9d1 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuAdd.h"
#include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
{
struct NEArithmeticAddition::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuAdd> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuAdd> op{nullptr};
};
-NEArithmeticAddition::NEArithmeticAddition()
- : _impl(std::make_unique<Impl>())
+NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>())
{
}
-NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
NEArithmeticAddition::~NEArithmeticAddition() = default;
-Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticAddition::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuAdd::validate(input1, input2, output, policy, act_info);
}
-void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticAddition::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 6fdd4267bf..097525c1a8 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/operators/CpuSub.h"
#include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
{
struct NEArithmeticSubtraction::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuSub> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSub> op{nullptr};
};
-NEArithmeticSubtraction::NEArithmeticSubtraction()
- : _impl(std::make_unique<Impl>())
+NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>())
{
}
-NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default;
NEArithmeticSubtraction::~NEArithmeticSubtraction() = default;
-Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuSub::validate(input1, input2, output, policy, act_info);
}
-void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticSubtraction::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index db49f4c1a0..d491f0aafc 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
@@ -36,12 +37,17 @@ namespace arm_compute
{
NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default;
-NEBatchNormalizationLayer::NEBatchNormalizationLayer()
- : _norm_kernel()
+NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel()
{
}
-void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon,
+void NEBatchNormalizationLayer::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
ActivationLayerInfo act_info)
{
ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
@@ -50,10 +56,17 @@ void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const
_norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info);
}
-Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
return Status{};
}
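
The NEBatchNormalizationLayer signatures re-wrapped above carry the usual inference-time batch-normalisation parameters, which combine as y = gamma * (x - mean) / sqrt(var + epsilon) + beta. A one-element worked example of that standard expression (not ComputeLibrary code):

#include <cmath>
#include <cstdio>

// Standard batch-normalisation of a single value at inference time.
float batch_norm(float x, float mean, float var, float beta, float gamma, float epsilon)
{
    return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
}

int main()
{
    // 2 * (2 - 1) / sqrt(4 + 1e-5) + 0.5 ~= 1.5
    const float y = batch_norm(2.0f, /*mean*/ 1.0f, /*var*/ 4.0f, /*beta*/ 0.5f, /*gamma*/ 2.0f, /*epsilon*/ 1e-5f);
    std::printf("%f\n", y);
    return 0;
}
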
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index e258028d05..5d711c5ddf 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
@@ -41,19 +42,25 @@ void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_s
_kernel = std::move(k);
}
-void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
+void NEBatchToSpaceLayer::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
auto k = std::make_unique<NEBatchToSpaceLayerKernel>();
k->configure(input, block_shape_x, block_shape_y, output, crop_info);
_kernel = std::move(k);
}
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
}
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
}
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 90eb72706e..89ce2087be 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
-#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index 69e5288b88..eda59cd3e9 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
-#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index 0b19e919ee..3d6f30b0fe 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
-#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index cc9df9f1c4..f0cf3d3e5c 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
-#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
index af00171be6..adf891e417 100644
--- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -22,12 +22,16 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
namespace arm_compute
{
-void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransform::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
// Configure Bounding Box kernel
@@ -36,7 +40,10 @@ void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes
_kernel = std::move(k);
}
-Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
}
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index f93a6ea745..1fd172a730 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuCast.h"
@@ -31,16 +32,15 @@ namespace arm_compute
{
struct NECast::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCast> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCast> op{nullptr};
};
-NECast::NECast()
- : _impl(std::make_unique<Impl>())
+NECast::NECast() : _impl(std::make_unique<Impl>())
{
}
-NECast::NECast(NECast &&) = default;
+NECast::NECast(NECast &&) = default;
NECast &NECast::operator=(NECast &&) = default;
NECast::~NECast() = default;
@@ -62,7 +62,7 @@ Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, Con
void NECast::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
index 8b96fadb74..86bee4dd43 100644
--- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/core/Types.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index ceb697aad6..59a0892f1f 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,33 +23,31 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
-#include "src/cpu/operators/CpuConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/operators/CpuConcatenate.h"
namespace arm_compute
{
struct NEConcatenateLayer::Impl
{
std::vector<const ITensor *> srcs{};
- ITensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<cpu::CpuConcatenate> op{ nullptr };
+ ITensor *dst{nullptr};
+ unsigned int num_inputs{0};
+ unsigned int axis{0};
+ std::unique_ptr<cpu::CpuConcatenate> op{nullptr};
};
-NEConcatenateLayer::NEConcatenateLayer()
- : _impl(std::make_unique<Impl>())
+NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>())
{
}
-NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
+NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default;
NEConcatenateLayer::~NEConcatenateLayer() = default;
@@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I
_impl->op = std::make_unique<cpu::CpuConcatenate>();
std::vector<const ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+ for (unsigned int i = 0; i < inputs_vector.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I
_impl->op->configure(inputs_vector_info, _impl->dst->info(), axis);
}
-Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+ const ITensorInfo *output,
+ size_t axis)
{
return cpu::CpuConcatenate::validate(inputs_vector, output, axis);
}
@@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
void NEConcatenateLayer::run()
{
ITensorPack pack;
- for(unsigned i = 0; i < _impl->num_inputs; ++i)
+ for (unsigned i = 0; i < _impl->num_inputs; ++i)
{
pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
}
diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp
index 3bb66c44b0..8f41151d6c 100644
--- a/src/runtime/NEON/functions/NEConv3D.cpp
+++ b/src/runtime/NEON/functions/NEConv3D.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDirectConv3d.h"
@@ -35,35 +36,41 @@ using namespace arm_compute::experimental;
struct NEConv3D::Impl
{
- std::unique_ptr<cpu::ICpuOperator> op{ nullptr };
+ std::unique_ptr<cpu::ICpuOperator> op{nullptr};
ITensorPack run_pack{};
};
-NEConv3D::NEConv3D()
- : _impl(std::make_unique<Impl>())
+NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>())
{
}
NEConv3D::~NEConv3D() = default;
-void NEConv3D::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
+void NEConv3D::configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info);
auto f = std::make_unique<cpu::CpuDirectConv3d>();
- f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info);
+ f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(),
+ conv_info);
_impl->op = std::move(f);
- if(_impl->op != nullptr)
+ if (_impl->op != nullptr)
{
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
}
}
-Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info)
+Status NEConv3D::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv3dInfo &conv_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info));
@@ -72,7 +79,7 @@ Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights,
void NEConv3D::run()
{
- if(_impl->op != nullptr)
+ if (_impl->op != nullptr)
{
_impl->op->run(_impl->run_pack);
}
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index 535ac99001..84e8565aaf 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -24,24 +24,26 @@
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
namespace arm_compute
{
struct NEConvertFullyConnectedWeights::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{nullptr};
};
-NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
- : _impl(std::make_unique<Impl>())
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
{
}
NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default;
-void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void NEConvertFullyConnectedWeights::configure(const ITensor *input,
+ ITensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -51,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou
_impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
}
-Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
}
@@ -64,4 +68,4 @@ void NEConvertFullyConnectedWeights::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 89e0e498c9..37958fc2e9 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuConv2d.h"
@@ -43,34 +44,44 @@ struct NEConvolutionLayer::Impl
{
MemoryGroup memory_group{};
std::shared_ptr<IMemoryManager> memory_manager{};
- std::unique_ptr<cpu::ICpuOperator> op{ nullptr };
+ std::unique_ptr<cpu::ICpuOperator> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
WorkspaceData<Tensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- std::unique_ptr<IFunction> func{ nullptr };
+ std::unique_ptr<IFunction> func{nullptr};
};
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_manager = std::move(memory_manager);
}
NEConvolutionLayer::~NEConvolutionLayer() = default;
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void NEConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
case ConvolutionMethod::GEMM:
@@ -78,7 +89,8 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
case ConvolutionMethod::DIRECT:
{
auto f = std::make_unique<cpu::CpuConv2d>();
- f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr),
+ output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
_impl->op = std::move(f);
break;
}
@@ -94,33 +106,46 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
break;
}
- if(_impl->op)
+ if (_impl->op)
{
_impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
}
-Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status NEConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
- switch(cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
case ConvolutionMethod::GEMM:
case ConvolutionMethod::GEMM_CONV2D:
case ConvolutionMethod::DIRECT:
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info,
+ weights_info, dilation, act_info, enable_fast_math,
+ num_groups));
break;
case ConvolutionMethod::FFT:
- ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
break;
default:
ARM_COMPUTE_ERROR("Not supported.");
@@ -129,12 +154,17 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
return Status{};
}
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
+ return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math);
}
void NEConvolutionLayer::run()
@@ -143,7 +173,7 @@ void NEConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_impl->memory_group);
- if(_impl->func)
+ if (_impl->func)
{
_impl->func->run();
}
@@ -155,7 +185,7 @@ void NEConvolutionLayer::run()
void NEConvolutionLayer::prepare()
{
- if(_impl->func)
+ if (_impl->func)
{
_impl->func->prepare();
}
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index c2059e8e98..c975d3a5b5 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuCopy.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NECopy::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCopy> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCopy> op{nullptr};
};
-NECopy::NECopy()
- : _impl(std::make_unique<Impl>())
+NECopy::NECopy() : _impl(std::make_unique<Impl>())
{
}
-NECopy::NECopy(NECopy &&) = default;
+NECopy::NECopy(NECopy &&) = default;
NECopy &NECopy::operator=(NECopy &&) = default;
NECopy::~NECopy() = default;
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index cca8b400ee..a94b0882da 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NECropKernel.h"
@@ -35,18 +36,32 @@ namespace arm_compute
NECropResize::~NECropResize() = default;
NECropResize::NECropResize()
- : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+ : _output(nullptr),
+ _num_boxes(0),
+ _method(),
+ _extrapolation_value(0),
+ _crop(),
+ _scale(),
+ _crop_results(),
+ _scaled_results()
{
}
-Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status NECropResize::validate(const ITensorInfo *input,
+ const ITensorInfo *boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
TensorInfo temp_info;
- ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
- if(output->total_size() > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(),
+ box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1,
+ extrapolation_value));
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -56,11 +71,17 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes
return Status{};
}
-void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void NECropResize::configure(const ITensor *input,
+ const ITensor *boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+ crop_size, method, extrapolation_value));
ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
_num_boxes = boxes->info()->tensor_shape()[1];
@@ -81,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I
_scaled_results.reserve(_num_boxes);
_scale.reserve(_num_boxes);
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
auto crop_tensor = std::make_unique<Tensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -108,7 +129,7 @@ void NECropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
// Size of the crop box in _boxes and thus the shape of _crop_results[i]
// may not be known until run-time and so the kernels cannot be configured until then.
@@ -117,12 +138,15 @@ void NECropResize::run()
NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
// Scale the cropped image.
- _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false });
+ _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(),
+ ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value),
+ SamplingPolicy::TOP_LEFT, false});
_scaled_results[i]->allocator()->allocate();
_scale[i]->run();
// Copy scaled image into output.
- std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+ std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(),
+ _output->ptr_to_element(Coordinates(0, 0, 0, i)));
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 439aff0840..3987370d9e 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -25,9 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -61,7 +62,8 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p
deconv_pad_top += deconv_pad_y / 2;
deconv_pad_bottom += deconv_pad_y / 2;
- return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+ return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom,
+ DimensionRoundingType::FLOOR);
}
} // namespace
@@ -82,17 +84,24 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
{
}
-Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
- bool enable_fast_math, const WeightsInfo &weights_info)
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
- if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
}
@@ -101,11 +110,13 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info);
+ auto out_dims =
+ deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx),
+ weights->dimension(width_idx), weights->dimension(height_idx), info);
- if(bias != nullptr)
+ if (bias != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -115,15 +126,18 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
}
}
- if(output->tensor_shape().total_size() > 0)
+ if (output->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+ "Output's depth is invalid.");
}
uint32_t deconv_pad_x = 0;
@@ -141,44 +155,61 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first);
ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second);
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
// Do not perform upsampling when the operation uses unit stride in all dimensions
const bool do_upsampling = stride_x != 1 || stride_y != 1;
- const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int batches_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx));
- if(do_upsampling)
+ if (do_upsampling)
{
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info,
+ weights_info, Size2D(1U, 1U), ActivationLayerInfo(),
+ enable_fast_math));
}
else
{
- const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
+ const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+ upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info,
+ Size2D(1U, 1U), ActivationLayerInfo(),
+ enable_fast_math));
}
return Status{};
}
-void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info)
+void NEDeconvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math,
+ const WeightsInfo &weights_info)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(),
+ (bias == nullptr) ? nullptr : bias->info(),
+ output->info(), info, enable_fast_math, weights_info));
ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info);
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
- weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
+ auto out_dims = deconvolution_output_dimensions(
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
@@ -191,7 +222,8 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
const unsigned int stride_y = info.stride().second;
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
@@ -199,12 +231,11 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
_flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
// setup the function to convolve the upscaled output
- uint32_t deconv_pad_x = 0;
- uint32_t deconv_pad_y = 0;
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
- stride_x, stride_y,
- out_dims, deconv_pad_x, deconv_pad_y);
- const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
+ uint32_t deconv_pad_x = 0;
+ uint32_t deconv_pad_y = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+ *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+ const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
// Do not perform upsampling when the operation uses unit stride in all dimensions
_do_upsampling = stride_x != 1 || stride_y != 1;
@@ -216,12 +247,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
axis_data[1] = static_cast<uint32_t>(height_idx);
// Setup convolution and upsampling, if needed
- if(_do_upsampling)
+ if (_do_upsampling)
{
_memory_group.manage(&_scaled_output);
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
scale_out_info.set_data_layout(data_layout);
_scaled_output.allocator()->init(scale_out_info);
@@ -229,14 +260,17 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
// The padding amount can be given as input to the convolution layer.
_upsample_f.configure(input, &_scaled_output, upsample_info);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+ ActivationLayerInfo(), enable_fast_math);
_scaled_output.allocator()->allocate();
}
else
{
- const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
- _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+ const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+ upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+ _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+ ActivationLayerInfo(), enable_fast_math);
}
}
@@ -246,7 +280,7 @@ void NEDeconvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_upsampling)
+ if (_do_upsampling)
{
_upsample_f.run();
}
@@ -255,7 +289,7 @@ void NEDeconvolutionLayer::run()
void NEDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 1ec32074a5..766635dfa1 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuCast.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NEDepthConvertLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCast> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCast> op{nullptr};
};
-NEDepthConvertLayer::NEDepthConvertLayer()
- : _impl(std::make_unique<Impl>())
+NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>())
{
}
-NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
+NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default;
NEDepthConvertLayer::~NEDepthConvertLayer() = default;
@@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve
_impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
}
-Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
return cpu::CpuCast::validate(input, output, policy);
@@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo
void NEDepthConvertLayer::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index f4a8a17e05..47564059ec 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 4dabef3bd7..6c085645db 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDepthwiseConv2d.h"
@@ -39,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
- ITensor *src{ nullptr }; // SRC_0
- ITensor *dst{ nullptr }; // DST_0
- const ITensor *weights
- {
- nullptr
- }; // SRC_1
- const ITensor *biases
- {
- nullptr
- }; // SRC_2
+ ITensor *src{nullptr}; // SRC_0
+ ITensor *dst{nullptr}; // DST_0
+ const ITensor *weights{nullptr}; // SRC_1
+ const ITensor *biases{nullptr}; // SRC_2
Tensor permuted_input{}; // INT_0
Tensor permuted_weights{}; // INT_1
Tensor permuted_output{}; // INT_2
Tensor workspace{}; // INT_3
Tensor packed_weights{}; // INT_4
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
- bool is_prepared{ false };
- bool permute{ false };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
+ bool is_prepared{false};
+ bool permute{false};
};
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(
+ std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(
+ ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -82,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->permute = is_nhwc;
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
- _impl->dst->info(), info);
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
+ _impl->op->configure(_impl->src->info(), _impl->weights->info(),
+ _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info);
// Configure pipeline
ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
@@ -92,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
- if(!is_activationlayer_enabled)
+ if (!is_activationlayer_enabled)
{
act_info_to_use = act_info;
}
- info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
+ info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation};
auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
- if(is_nhwc)
+ if (is_nhwc)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -122,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
// Configure optimized depthwise
- dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
+ dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(),
+ biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(),
+ info);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
@@ -133,29 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
}
else
{
- dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
+ dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(),
+ biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
}
// Allocate memory based on the internal memory requirements
experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
- _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
- _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
+ _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8),
+ mem_req[0].alignment);
+ _impl->packed_weights.allocator()->init(
+ TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment);
_memory_group.manage(&_impl->workspace);
_memory_group.manage(&_impl->packed_weights);
_impl->workspace.allocator()->allocate();
_impl->packed_weights.allocator()->allocate();
}
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+Status
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
@@ -180,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
// Permute weights
- if(_impl->permute)
+ if (_impl->permute)
{
_impl->permuted_weights.allocator()->allocate();
}
- if(!_impl->permuted_weights.is_used())
+ if (!_impl->permuted_weights.is_used())
{
_impl->permuted_weights.allocator()->free();
}
@@ -202,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
Tensor permuted_input{};
Tensor permuted_weights{};
Tensor permuted_output{};
- bool is_prepared{ false };
- bool is_nchw{ false };
- bool is_activationlayer_enabled{ false };
- const ITensor *weights{ nullptr };
- const ITensor *biases{ nullptr };
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ bool is_prepared{false};
+ bool is_nchw{false};
+ bool is_activationlayer_enabled{false};
+ const ITensor *weights{nullptr};
+ const ITensor *biases{nullptr};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -217,14 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv
{
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
- _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
+ _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(),
+ info);
_impl->src = input;
_impl->dst = output;
@@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = output;
- if(_impl->is_nchw)
+ if (_impl->is_nchw)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
weights_to_use = &_impl->permuted_weights;
- _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ _impl->permuted_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
output_to_use = &_impl->permuted_output;
}
auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
- depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
+ depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(),
+ biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
- if(_impl->is_nchw)
+ if (_impl->is_nchw)
{
auto permute_output = std::make_unique<cpu::CpuPermute>();
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
@@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
}
}
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
@@ -298,49 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemory
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
- DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
- NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+ DepthwiseConvolutionFunction depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED};
+ NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr};
NEDepthwiseConvolutionLayerGeneric func_generic{};
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
};
#endif // DOXYGEN_SKIP_THIS
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
- output->info(), conv_info, depth_multiplier, act_info, dilation));
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(
+ input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info,
+ depth_multiplier, act_info, dilation));
- const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
_impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>();
- _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- info);
- switch(_impl->depth_conv_func)
+ _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
- _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
{
- switch(_impl->depth_conv_func)
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.run();
@@ -355,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run()
void NEDepthwiseConvolutionLayer::prepare()
{
- switch(_impl->depth_conv_func)
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.prepare();
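Illustrative usage sketch (not part of this patch): the configure/run flow of the reformatted NEDepthwiseConvolutionLayer above, assuming a hypothetical 32x32x8 F32 input in the default NCHW layout, a 3x3 depthwise kernel and no bias. Shapes and values here are invented for illustration only.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, dst;
    // 3x3 depthwise kernel over an 8-channel 32x32 input, depth multiplier 1.
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    NEDepthwiseConvolutionLayer dwc;
    // Stride 1, padding 1: output keeps the 32x32 spatial size. Biases may be nullptr.
    dwc.configure(&src, &weights, /*biases=*/nullptr, &dst,
                  PadStrideInfo(1, 1, 1, 1), /*depth_multiplier=*/1);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    dst.allocator()->allocate();

    dwc.run();
    return 0;
}

configure() internally calls validate() and picks the OPTIMIZED or GENERIC path, as the switch in the hunk above shows.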
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 83e0131c83..28d19d2950 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -26,19 +26,19 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/cpu/operators/CpuDequantize.h"
namespace arm_compute
{
struct NEDequantizationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDequantize> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuDequantize> op{nullptr};
};
-NEDequantizationLayer::NEDequantizationLayer()
- : _impl(std::make_unique<Impl>())
+NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>())
{
}
NEDequantizationLayer::~NEDequantizationLayer() = default;
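Illustrative usage sketch (not part of this patch): a minimal NEDequantizationLayer flow, assuming a hypothetical 16x16 QASYMM8 source with scale 0.1 and offset 128 and an F32 destination.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(
        TensorInfo(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    NEDequantizationLayer dequant;
    dequant.configure(&src, &dst); // src must use a quantized data type, dst a float type

    src.allocator()->allocate();
    dst.allocator()->allocate();

    dequant.run(); // forwards to the cpu::CpuDequantize operator held in Impl
    return 0;
}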
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 1da8b012b3..b347390162 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include <cstddef>
@@ -35,24 +36,36 @@
namespace arm_compute
{
NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+ : _memory_group(std::move(memory_manager)),
+ _dequantize(),
+ _detection_post_process(),
+ _decoded_scores(),
+ _run_dequantize(false)
{
}
-void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+ const ITensor *input_scores,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
- ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
- output_scores->info(),
- num_detection->info(), info));
- ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+ output_scores);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(
+ input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+ output_classes->info(), output_scores->info(), num_detection->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+ num_detection, info);
const ITensor *input_scores_to_use = input_scores;
DetectionPostProcessLayerInfo info_to_use = info;
_run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type());
- if(_run_dequantize)
+ if (_run_dequantize)
{
_memory_group.manage(&_decoded_scores);
@@ -61,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c
input_scores_to_use = &_decoded_scores;
// Create a new info struct to avoid dequantizing in the CPP layer
- std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
- DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
- scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+ std::array<float, 4> scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(),
+ info.scale_value_w()};
+ DetectionPostProcessLayerInfo info_quantized(
+ info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(),
+ info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false);
info_to_use = info_quantized;
}
- _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+ _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes,
+ output_classes, output_scores, num_detection, info_to_use);
_decoded_scores.allocator()->allocate();
}
-Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_scores,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info)
{
bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
- if(run_dequantize)
+ if (run_dequantize)
{
TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors,
+ output_boxes, output_classes, output_scores,
+ num_detection, info));
return Status{};
}
@@ -90,7 +114,7 @@ void NEDetectionPostProcessLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Decode scores if necessary
- if(_run_dequantize)
+ if (_run_dequantize)
{
_dequantize.run();
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index ef3d3d6055..f1c2cf969f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,18 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/cpu/operators/CpuDirectConv2d.h"
namespace arm_compute
{
struct NEDirectConvolutionLayer::Impl
{
- ITensor *src{ nullptr };
- const ITensor *weights{ nullptr };
- const ITensor *bias{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr };
+ ITensor *src{nullptr};
+ const ITensor *weights{nullptr};
+ const ITensor *bias{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr};
};
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManage
}
NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void NEDirectConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
_impl->src = input;
_impl->weights = weights;
_impl->bias = bias;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
- _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
+ _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(),
+ conv_info, act_info);
}
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
index c958adf97c..685ef2d4d7 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include "arm_compute/core/Validate.h"
-#include "src/cpu/operators/CpuElementwise.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuElementwise.h"
#include <utility>
@@ -33,17 +34,16 @@ namespace arm_compute
{
struct NEElementwiseMax::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseMax> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr};
};
-NEElementwiseMax::NEElementwiseMax()
- : _impl(std::make_unique<Impl>())
+NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
NEElementwiseMax::~NEElementwiseMax() = default;
@@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMax::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseMax::validate(input1, input2, output);
@@ -74,17 +77,16 @@ void NEElementwiseMax::run()
struct NEElementwiseMin::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseMin> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr};
};
-NEElementwiseMin::NEElementwiseMin()
- : _impl(std::make_unique<Impl>())
+NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
NEElementwiseMin::~NEElementwiseMin() = default;
@@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMin::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseMin::validate(input1, input2, output);
@@ -115,21 +120,23 @@ void NEElementwiseMin::run()
struct NEElementwiseSquaredDiff::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr};
};
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff()
- : _impl(std::make_unique<Impl>())
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default;
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseSquaredDiff::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output);
@@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run()
struct NEElementwiseDivision::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseDivision> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr};
};
-NEElementwiseDivision::NEElementwiseDivision()
- : _impl(std::make_unique<Impl>())
+NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
NEElementwiseDivision::~NEElementwiseDivision() = default;
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseDivision::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseDivision::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseDivision::validate(input1, input2, output);
@@ -197,21 +212,23 @@ void NEElementwiseDivision::run()
struct NEElementwisePower::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwisePower> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwisePower> op{nullptr};
};
-NEElementwisePower::NEElementwisePower()
- : _impl(std::make_unique<Impl>())
+NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>())
{
}
-NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
NEElementwisePower::~NEElementwisePower() = default;
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwisePower::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwisePower::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwisePower::validate(input1, input2, output);
@@ -239,22 +259,22 @@ void NEElementwisePower::run()
template <ComparisonOperation COP>
struct NEElementwiseComparisonStatic<COP>::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{nullptr};
};
template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic()
- : _impl(std::make_unique<Impl>())
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>())
{
}
template <ComparisonOperation COP>
NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP> &NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation COP>
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP> &
+NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
template <ComparisonOperation COP>
@@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *inp
}
template <ComparisonOperation COP>
-Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
{
return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output);
}
template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::run()
+void NEElementwiseComparisonStatic<COP>::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
@@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic<COP>::run()
struct NEElementwiseComparison::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseComparison> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr};
};
-NEElementwiseComparison::NEElementwiseComparison()
- : _impl(std::make_unique<Impl>())
+NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
NEElementwiseComparison::~NEElementwiseComparison() = default;
@@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso
_impl->op->configure(input1->info(), input2->info(), output->info(), op);
}
-Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+Status NEElementwiseComparison::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation op)
{
return cpu::CpuElementwiseComparison::validate(input1, input2, output, op);
}
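Illustrative usage sketch (not part of this patch): one of the element-wise functions reformatted above, NEElementwiseMax, on two hypothetical 16x16 F32 tensors. The activation info must stay disabled, as the ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()) checks in the hunks above enforce.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, out;
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NEElementwiseMax max_op;
    max_op.configure(&a, &b, &out); // default ActivationLayerInfo(), i.e. disabled

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    max_op.run(); // packs src_0/src_1/dst and dispatches cpu::CpuElementwiseMax
    return 0;
}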
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index a0674ec320..23a092c407 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -22,7 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
+
#include "src/cpu/operators/CpuElementwiseUnary.h"
+
#include <utility>
namespace arm_compute
@@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary;
template <ElementWiseUnary op>
struct NEElementwiseUnaryLayer<op>::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<OperatorType> cpu_op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<OperatorType> cpu_op{nullptr};
};
template <ElementWiseUnary op>
-NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer()
- : _impl(std::make_unique<Impl>())
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>())
{
}
template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default;
template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default;
-template <ElementWiseUnary op>
+template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default;
template <ElementWiseUnary op>
@@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer<op>::validate(const ITensorInfo *input, const ITe
}
template <ElementWiseUnary op>
-void NEElementwiseUnaryLayer<op>::run()
+void NEElementwiseUnaryLayer<op>::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, _impl->src);
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 343b817eba..fb75f9da29 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
@@ -37,7 +38,15 @@ namespace arm_compute
NEFFT1D::~NEFFT1D() = default;
NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+ : _memory_group(std::move(memory_manager)),
+ _digit_reverse_kernel(),
+ _fft_kernels(),
+ _scale_kernel(),
+ _digit_reversed_input(),
+ _digit_reverse_indices(),
+ _num_ffts(0),
+ _axis(0),
+ _run_scale(false)
{
}
@@ -74,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
_fft_kernels.resize(_num_ffts);
_axis = config.axis;
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -84,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
_fft_kernels[i] = std::make_unique<NEFFTRadixStageKernel>();
- _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr,
+ fft_kernel_info);
Nx *= radix_for_stage;
}
// Configure scale kernel
- if(_run_scale)
+ if (_run_scale)
{
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
_scale_kernel = std::make_unique<NEFFTScaleKernel>();
- is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config)
+ : _scale_kernel->configure(output, nullptr, scale_config);
}
// Allocate tensors
@@ -113,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
// Check if FFT is decomposable
const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -122,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
// All combinations are supported except real input with real output (i.e., both input channels set to 1)
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
@@ -140,13 +151,13 @@ void NEFFT1D::run()
NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ));
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX));
}
// Run output scaling
- if(_run_scale)
+ if (_run_scale)
{
NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY);
}
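Illustrative usage sketch (not part of this patch): a forward NEFFT1D over a hypothetical length-128 complex F32 signal (two channels: real and imaginary), transformed along axis 0. The length is assumed to decompose into the radixes supported by NEFFTRadixStageKernel.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    // 2 channels per element: interleaved real and imaginary parts.
    src.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32));

    FFT1DInfo cfg;
    cfg.axis      = 0;
    cfg.direction = FFTDirection::Forward;

    NEFFT1D fft;
    fft.configure(&src, &dst, cfg);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    fft.run(); // digit-reverse, radix stages, and (for inverse FFTs) the scale kernel
    return 0;
}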
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index ab422bd2ae..066909221d 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Scheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -33,7 +34,10 @@ namespace arm_compute
NEFFT2D::~NEFFT2D() = default;
NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+ : _memory_group(memory_manager),
+ _first_pass_func(memory_manager),
+ _second_pass_func(memory_manager),
+ _first_pass_tensor()
{
}
@@ -78,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index 0551d756fb..94f85e5ffa 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -25,15 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/utils/helpers/fft.h"
namespace arm_compute
@@ -46,11 +47,11 @@ int pad_decomposable(int N)
int pad = 0;
bool is_decomposed = false;
- while(!is_decomposed)
+ while (!is_decomposed)
{
const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
is_decomposed = !decomposed_vector.empty();
- if(!is_decomposed)
+ if (!is_decomposed)
{
++pad;
}
@@ -102,8 +103,13 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
}
NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default;
-void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void NEFFTConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
@@ -115,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_has_bias = biases != nullptr;
// Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
- pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ const Size2D input_dims =
+ Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size =
+ Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
// Tensors to use
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = _has_bias ? &_bias_output : output;
// Permute bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
_permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -137,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Permute input if needed
_needs_permute = input->info()->data_layout() == DataLayout::NHWC;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -158,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
// Pad weights
- const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
_pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
// Transform weights
@@ -166,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
- const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
_memory_group.manage(&_padded_input);
_pad_input_func.configure(input_to_use, &_padded_input, padding_in);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
}
@@ -193,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_memory_group.manage(&_itransformed_output);
FFT2DInfo itranform_info;
itranform_info.direction = FFTDirection::Inverse;
- _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransformed_output.allocator()->init(
+ _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
_itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info);
_output_reduced.allocator()->allocate();
@@ -205,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Extract correct region
const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
- const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
- const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
- if(_has_bias)
+ const int end_right =
+ _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+ const int end_botton =
+ _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if (_has_bias)
{
_memory_group.manage(&_bias_output);
}
- else if(_needs_permute)
+ else if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top),
+ Coordinates(end_right, end_botton));
_reshaped_output.allocator()->allocate();
_itransformed_output.allocator()->allocate();
// Add bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
@@ -235,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
}
// Permute output
- if(_needs_permute)
+ if (_needs_permute)
{
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -247,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.configure(output, nullptr, act_info);
}
@@ -260,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
axis_data[1] = 1;
}
-Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
@@ -279,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
const auto strides = conv_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+ conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+ conv_info.pad_bottom() != (kernel_size.y() / 2));
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
@@ -291,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+ (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
// Validate Activation Layer
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
}
@@ -313,7 +334,7 @@ void NEFFTConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Transform input
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_func.run();
}
@@ -331,17 +352,17 @@ void NEFFTConvolutionLayer::run()
_extract_output_func.run();
// Add bias
- if(_has_bias)
+ if (_has_bias)
{
_bias_add_func.run();
}
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_func.run();
}
// Run activation layer
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.run();
}
@@ -349,10 +370,10 @@ void NEFFTConvolutionLayer::run()
void NEFFTConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Permute bias to NCHW
- if(_original_bias != nullptr)
+ if (_original_bias != nullptr)
{
_permuted_bias.allocator()->allocate();
_permute_bias_func.run();
@@ -362,7 +383,7 @@ void NEFFTConvolutionLayer::prepare()
const ITensor *cur_weights = _original_weights;
// Permute weights
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index 43667783bf..bc1d5b7f5c 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEFill.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuFill.h"
#include <utility>
@@ -32,15 +33,14 @@ namespace arm_compute
{
struct NEFill::Impl
{
- ITensor *tensor{ nullptr };
- std::unique_ptr<cpu::CpuFill> op{ nullptr };
+ ITensor *tensor{nullptr};
+ std::unique_ptr<cpu::CpuFill> op{nullptr};
};
-NEFill::NEFill()
- : _impl(std::make_unique<Impl>())
+NEFill::NEFill() : _impl(std::make_unique<Impl>())
{
}
-NEFill::NEFill(NEFill &&) = default;
+NEFill::NEFill(NEFill &&) = default;
NEFill &NEFill::operator=(NEFill &&) = default;
NEFill::~NEFill() = default;
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index d633e340f8..a3ab9c3db4 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -25,17 +25,20 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
-NEFillBorder::NEFillBorder()
- : _border_handler(nullptr)
+NEFillBorder::NEFillBorder() : _border_handler(nullptr)
{
}
-void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorder::configure(ITensor *input,
+ unsigned int border_width,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value);
_border_handler = std::make_unique<NEFillBorderKernel>();
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index f435842634..56db2be3fa 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/cpu/operators/CpuFlatten.h"
@@ -33,16 +34,15 @@ namespace arm_compute
{
struct NEFlattenLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuFlatten> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuFlatten> op{nullptr};
};
-NEFlattenLayer::NEFlattenLayer()
- : _impl(std::make_unique<Impl>())
+NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>())
{
}
-NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
+NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default;
NEFlattenLayer::~NEFlattenLayer() = default;
@@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_impl->src = input;
_impl->dst = output;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_flatten_shape(input->info())));
_impl->op = std::make_unique<cpu::CpuFlatten>();
_impl->op->configure(_impl->src->info(), _impl->dst->info());
@@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
}
return cpu::CpuFlatten::validate(input, output);
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index d2dc48a159..112c93c478 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -24,22 +24,22 @@
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuFloor.h"
namespace arm_compute
{
struct NEFloor::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuFloor> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuFloor> op{nullptr};
};
-NEFloor::NEFloor()
- : _impl(std::make_unique<Impl>())
+NEFloor::NEFloor() : _impl(std::make_unique<Impl>())
{
}
-NEFloor::NEFloor(NEFloor &&) = default;
+NEFloor::NEFloor(NEFloor &&) = default;
NEFloor &NEFloor::operator=(NEFloor &&) = default;
NEFloor::~NEFloor() = default;
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 891487efd3..2656d0fa0f 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuFullyConnected.h"
@@ -38,80 +39,90 @@ using namespace arm_compute::experimental;
struct NEFullyConnectedLayer::Impl
{
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
- std::unique_ptr<cpu::CpuFullyConnected> op{ nullptr };
+ std::unique_ptr<cpu::CpuFullyConnected> op{nullptr};
- const ITensor *original_weights{ nullptr };
+ const ITensor *original_weights{nullptr};
ITensorPack run_pack{};
WorkspaceData<Tensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- bool is_prepared{ false };
- bool dynamic_weights{ false };
+ bool is_prepared{false};
+ bool dynamic_weights{false};
};
NEFullyConnectedLayer::~NEFullyConnectedLayer() = default;
-NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
_impl->weights_manager = weights_manager;
}
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+void NEFullyConnectedLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(),
- weights->info(),
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(),
biases != nullptr ? biases->info() : nullptr,
- output->info(),
- fc_info,
- weights_info));
+ output->info(), fc_info, weights_info));
ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info);
_impl->op = std::make_unique<cpu::CpuFullyConnected>();
_impl->original_weights = weights;
_impl->is_prepared = false;
- _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info, weights_info);
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
+ fc_info, weights_info);
- if(_impl->weights_manager != nullptr)
+ if (_impl->weights_manager != nullptr)
{
_impl->weights_manager->manage(_impl->original_weights);
}
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
-
- _impl->dynamic_weights =
- !weights->info()->are_values_constant() &&
- fc_info.transpose_weights &&
- !fc_info.are_weights_reshaped &&
- !fc_info.retain_internal_weights;
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+
+ _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
}
-Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info,
- const WeightsInfo &weights_info)
+Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const FullyConnectedLayerInfo &fc_info,
+ const WeightsInfo &weights_info)
{
- return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, weights_info);
+ return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info,
+ weights_info);
}
-Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+Status NEFullyConnectedLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info);
}
void NEFullyConnectedLayer::run()
{
- if(!_impl->dynamic_weights)
+ if (!_impl->dynamic_weights)
{
prepare();
}
@@ -122,7 +133,7 @@ void NEFullyConnectedLayer::run()
void NEFullyConnectedLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
@@ -131,13 +142,13 @@ void NEFullyConnectedLayer::prepare()
_impl->is_prepared = true;
// Handle weights managed infrastructure
- if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
+ if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
{
// Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare
// This is for cases where multiple functions share the same b (weights)
// Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference
const ITensor *original_b = _impl->original_weights;
- if(!original_b->is_used())
+ if (!original_b->is_used())
{
_impl->weights_manager->pre_mark_as_unused(original_b);
}
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index 6612845d86..f5b8b57dac 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
@@ -35,29 +36,42 @@ namespace arm_compute
{
NEFuseBatchNormalization::~NEFuseBatchNormalization() = default;
-NEFuseBatchNormalization::NEFuseBatchNormalization()
- : _fuse_bn_kernel()
+NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel()
{
}
-void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalization::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
- bn_beta, bn_gamma, epsilon, fbn_type);
+ ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
_fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>();
- _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
}
-Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void NEFuseBatchNormalization::run()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index e51f2f9eb6..934a8250cc 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuGemm.h"
@@ -39,12 +40,12 @@ namespace arm_compute
struct NEGEMM::Impl
{
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
- std::unique_ptr<cpu::CpuGemm> op{ nullptr };
+ std::unique_ptr<cpu::CpuGemm> op{nullptr};
- const ITensor *original_b{ nullptr };
- bool is_prepared{ false };
+ const ITensor *original_b{nullptr};
+ bool is_prepared{false};
ITensorPack run_pack{};
ITensorPack prep_pack{};
@@ -61,10 +62,17 @@ NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *
NEGEMM::~NEGEMM() = default;
-void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void NEGEMM::configure(const ITensor *a,
+ const ITensor *b,
+ const ITensor *c,
+ ITensor *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr,
+ d->info(), alpha, beta, gemm_info));
// Check if we need to reshape the matrix B only on the first run
_impl->is_prepared = false;
@@ -73,24 +81,32 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
// Make the B matrix dynamic values.
auto b_info_to_use = b->info()->clone();
- if(!gemm_info.reshape_b_only_on_first_run())
+ if (!gemm_info.reshape_b_only_on_first_run())
{
b_info_to_use->set_are_values_constant(false);
}
- _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info);
+ _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta,
+ gemm_info);
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c }, { ACL_DST, d } };
- _impl->prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c } };
- _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}};
+ _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMM::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
// Make the B matrix dynamic values.
auto b_to_use = b->clone();
- if(!gemm_info.reshape_b_only_on_first_run())
+ if (!gemm_info.reshape_b_only_on_first_run())
{
b_to_use->set_are_values_constant(false);
}
@@ -98,8 +114,14 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info);
}
-Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output,
- float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha, beta);
return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info);
@@ -115,15 +137,15 @@ void NEGEMM::run()
void NEGEMM::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->original_b->mark_as_unused();
}
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 42b8b70405..6cca02eea9 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuGemmDirectConv2d.h"
@@ -35,25 +36,25 @@ using namespace arm_compute::experimental;
struct NEGEMMConv2d::Impl
{
- const ITensor *weights{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ITensor *weights{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
WorkspaceData<Tensor> workspace{};
MemoryGroup memory_group{};
- bool is_prepared{ false };
+ bool is_prepared{false};
experimental::MemoryRequirements aux_mem_req{};
};
-NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _impl(std::make_unique<Impl>())
+NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(memory_manager);
}
NEGEMMConv2d::~NEGEMMConv2d() = default;
-void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
+void NEGEMMConv2d::configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -61,15 +62,21 @@ void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITens
_impl->is_prepared = false;
_impl->op = std::make_unique<OperatorType>();
- _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info);
+ _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ info);
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } };
- _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
+Status NEGEMMConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &info)
{
return OperatorType::validate(input, weights, biases, output, info);
}
@@ -84,15 +91,15 @@ void NEGEMMConv2d::run()
void NEGEMMConv2d::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->weights->mark_as_unused();
}
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index fe3ea6a767..c8f65d2fd9 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuGemmConv2d.h"
@@ -36,17 +37,18 @@ namespace arm_compute
{
struct NEGEMMConvolutionLayer::Impl
{
- const ITensor *weights{ nullptr };
- std::unique_ptr<cpu::CpuGemmConv2d> op{ nullptr };
+ const ITensor *weights{nullptr};
+ std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr};
ITensorPack run_pack{};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
MemoryRequirements aux_mem_req{};
WorkspaceData<Tensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
-NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->weights_manager = weights_manager;
@@ -54,37 +56,61 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryMana
}
NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void NEGEMMConvolutionLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
_impl->weights = weights;
_impl->op = std::make_unique<cpu::CpuGemmConv2d>();
- _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(),
+ conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, input },
- { TensorType::ACL_SRC_1, weights },
- { TensorType::ACL_SRC_2, biases },
- { TensorType::ACL_DST, output }
- };
- _impl->aux_mem_req = _impl->op->workspace();
- _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input},
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ {TensorType::ACL_DST, output}};
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
-Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
}
-Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math)
+Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
{
- return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math);
+ return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info,
+ dilation, act_info, enable_fast_math);
}
void NEGEMMConvolutionLayer::run()
@@ -96,7 +122,7 @@ void NEGEMMConvolutionLayer::run()
void NEGEMMConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->run_pack);
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 453d3cedef..44bfc6a51e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -29,8 +29,8 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
using namespace arm_compute::experimental;
@@ -39,18 +39,19 @@ namespace arm_compute
{
struct NEGEMMLowpMatrixMultiplyCore::Impl
{
- const ITensor *b{ nullptr };
- std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{ nullptr };
+ const ITensor *b{nullptr};
+ std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
MemoryGroup memory_group{};
- IWeightsManager *weights_manager{ nullptr };
+ IWeightsManager *weights_manager{nullptr};
MemoryRequirements aux_mem_req{};
WorkspaceData<Tensor> workspace_tensors{};
- bool is_prepared{ false };
+ bool is_prepared{false};
};
-NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
: _impl(std::make_unique<Impl>())
{
_impl->weights_manager = weights_manager;
@@ -58,41 +59,41 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
}
NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
-void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
+void NEGEMMLowpMatrixMultiplyCore::configure(
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
// Make the B matrix dynamic values.
auto b_info_to_use = b->info()->clone();
- if(!gemm_info.reshape_b_only_on_first_run())
+ if (!gemm_info.reshape_b_only_on_first_run())
{
b_info_to_use->set_are_values_constant(false);
}
_impl->b = b;
_impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
- _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC_0, a },
- { TensorType::ACL_SRC_1, b },
- { TensorType::ACL_SRC_2, c },
- { TensorType::ACL_DST, output }
- };
- _impl->prep_pack =
- {
- { TensorType::ACL_SRC_1, b },
- { TensorType::ACL_SRC_2, c }
- };
- _impl->aux_mem_req = _impl->op->workspace();
- _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(),
+ gemm_info);
+ _impl->run_pack = {{TensorType::ACL_SRC_0, a},
+ {TensorType::ACL_SRC_1, b},
+ {TensorType::ACL_SRC_2, c},
+ {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}};
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
// Make the B matrix dynamic values.
auto b_info_to_use = b->clone();
- if(!gemm_info.reshape_b_only_on_first_run())
+ if (!gemm_info.reshape_b_only_on_first_run())
{
b_info_to_use->set_are_values_constant(false);
}
@@ -109,15 +110,15 @@ void NEGEMMLowpMatrixMultiplyCore::run()
void NEGEMMLowpMatrixMultiplyCore::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
- auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
- _impl->aux_mem_req.end(),
- [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- if(has_reshape != std::end(_impl->aux_mem_req))
+ if (has_reshape != std::end(_impl->aux_mem_req))
{
_impl->b->mark_as_unused();
}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 7e1de3c257..8178003b5e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -25,45 +25,48 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
namespace arm_compute
{
struct NEGEMMLowpOutputStage::Impl
{
- const ITensor *src{ nullptr };
- const ITensor *bias{ nullptr };
- ITensor *dst{ nullptr };
+ const ITensor *src{nullptr};
+ const ITensor *bias{nullptr};
+ ITensor *dst{nullptr};
ITensorPack run_pack{};
- std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{ nullptr };
+ std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr};
};
-NEGEMMLowpOutputStage::NEGEMMLowpOutputStage()
- : _impl(std::make_unique<Impl>())
+NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
{
}
NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default;
-void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info)
+void NEGEMMLowpOutputStage::configure(const ITensor *input,
+ const ITensor *bias,
+ ITensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
_impl->src = input;
_impl->bias = bias;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuGemmLowpOutputStage>();
_impl->op->configure(input->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info);
- _impl->run_pack =
- {
- { TensorType::ACL_SRC, _impl->src },
- { TensorType::ACL_BIAS, _impl->bias },
- { TensorType::ACL_DST, _impl->dst }
- };
+ _impl->run_pack = {
+ {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}};
}
-Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info)
{
return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info);
}
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
index f5d19c769e..62b8cfa48b 100644
--- a/src/runtime/NEON/functions/NEGather.cpp
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGather.h"
-#include "src/core/NEON/kernels/NEGatherKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEGatherKernel.h"
#include <utility>
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 1c0e736766..1022b4153e 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,11 +25,12 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
{
@@ -68,42 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManage
NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default;
-void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+void NEGenerateProposalsLayer::configure(const ITensor *scores,
+ const ITensor *deltas,
+ const ITensor *anchors,
+ ITensor *proposals,
+ ITensor *scores_out,
+ ITensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+ proposals->info(), scores_out->info(),
+ num_valid_proposals->info(), info));
ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
_is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType scores_data_type = scores->info()->data_type();
_is_qasymm8 = scores_data_type == DataType::QASYMM8;
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ const int num_anchors = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
- const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+ const QuantizationInfo rois_qinfo =
+ (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
_compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>();
- _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors->configure(anchors, &_all_anchors,
+ ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+ _deltas_flattened.allocator()->init(
+ TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
_memory_group.manage(&_deltas_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
_flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
@@ -117,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
// Permute and reshape scores
_memory_group.manage(&_scores_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1});
_flatten_scores.configure(&_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
@@ -131,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
Tensor *anchors_to_use = &_all_anchors;
Tensor *deltas_to_use = &_deltas_flattened;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
_deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -154,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
anchors_to_use->allocator()->allocate();
_all_proposals_to_use = &_all_proposals;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
- _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _all_proposals_quantized.allocator()->init(
+ TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
_quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
@@ -174,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
// Note that NMS needs outputs preinitialized.
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+ rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
@@ -187,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
_memory_group.manage(&_proposals_4_roi_values);
- const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height());
- _cpp_nms.configure(&_scores_flattened /*scores_in*/,
- _all_proposals_to_use /*boxes_in,*/,
- nullptr /* batch_splits_in*/,
- scores_out /* scores_out*/,
- &_proposals_4_roi_values /*boxes_out*/,
- &_classes_nms_unused /*classes*/,
- nullptr /*batch_splits_out*/,
- &_keeps_nms_unused /*keeps*/,
- num_valid_proposals /* keeps_size*/,
- box_nms_info);
+ const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+ true, min_size_scaled, info.im_width(), info.im_height());
+ _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/,
+ nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/,
+ &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/,
+ num_valid_proposals /* keeps_size*/, box_nms_info);
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
@@ -205,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
_proposals_4_roi_values.allocator()->allocate();
}
-Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
- const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -218,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
- const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_anchors =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -229,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
}
- TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
-
- TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- if(scores->data_layout() == DataLayout::NHWC)
+ TensorInfo all_anchors_info(
+ anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(
+ anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info =
+ deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+ .set_is_resizable(true);
+ TensorInfo scores_permuted_info =
+ scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if (scores->data_layout() == DataLayout::NHWC)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
}
- TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo deltas_flattened_info(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
- TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(
+ scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
- TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
- if(is_qasymm8)
+ TensorInfo proposals_4_roi_values_quantized(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+ .set_quantization_info(QuantizationInfo(0.125f, 0));
+ if (is_qasymm8)
{
- TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ TensorInfo all_anchors_f32_info(anchors->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
- TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
-
- TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ TensorInfo deltas_flattened_f32_info(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(
+ &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
- if(num_valid_proposals->total_size() > 0)
+ if (num_valid_proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
}
- if(proposals->total_size() > 0)
+ if (proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -311,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
}
}
- if(scores_out->total_size() > 0)
+ if (scores_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -330,7 +373,7 @@ void NEGenerateProposalsLayer::run()
NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY);
// Transpose and reshape the inputs
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_permute_deltas.run();
_permute_scores.run();
@@ -339,7 +382,7 @@ void NEGenerateProposalsLayer::run()
_flatten_deltas.run();
_flatten_scores.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_dequantize_anchors.run();
_dequantize_deltas.run();
@@ -348,7 +391,7 @@ void NEGenerateProposalsLayer::run()
// Build the boxes
_bounding_box.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_quantize_all_proposals.run();
}
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index 822dcf491c..78218cbdee 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
@@ -34,7 +35,13 @@ namespace arm_compute
NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default;
NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+ : _memory_group(std::move(memory_manager)),
+ _normalization_kernel(),
+ _is_nchw(false),
+ _permute_input(),
+ _permute_output(),
+ _permuted_input(),
+ _permuted_output()
{
}
@@ -43,14 +50,14 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl
ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon);
const DataLayout data_layout = input->info()->data_layout();
- const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true };
+ const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true};
// Configure Kernels
_is_nchw = data_layout == DataLayout::NCHW;
_normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>();
- if(!_is_nchw)
+ if (!_is_nchw)
{
_memory_group.manage(&_permuted_input);
_memory_group.manage(&_permuted_output);
@@ -72,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl
}
}
-Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+Status NEInstanceNormalizationLayer::validate(
+ const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
{
- return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW),
- &output->clone()->set_data_layout(DataLayout::NCHW),
- InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true });
+ return NEInstanceNormalizationLayerKernel::validate(
+ &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW),
+ InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true});
}
void NEInstanceNormalizationLayer::run()
@@ -84,7 +92,7 @@ void NEInstanceNormalizationLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Permute input
- if(!_is_nchw)
+ if (!_is_nchw)
{
_permute_input.run();
}
@@ -92,7 +100,7 @@ void NEInstanceNormalizationLayer::run()
NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ);
// Permute output
- if(!_is_nchw)
+ if (!_is_nchw)
{
_permute_output.run();
}
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index c3ecfb430f..b7f6203efd 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -69,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
sum_sq.set_tensor_shape(shape);
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
shape.set(actual_axis, 1);
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 428cdf8c04..1a08cdeb06 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -24,11 +24,12 @@
#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/common/LSTMParams.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -39,42 +40,122 @@ using namespace arm_compute::utils::info_helpers;
NELSTMLayer::~NELSTMLayer() = default;
NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
- _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
- _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
- _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
- _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
- _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(),
- _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(),
- _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(),
- _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(),
- _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(),
- _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false),
+ : _memory_group(std::move(memory_manager)),
+ _fully_connected_input_gate(),
+ _accum_input_gate1(),
+ _subtract_input_gate(),
+ _pixelwise_mul_input_gate(),
+ _activation_input_gate(),
+ _fully_connected_forget_gate(),
+ _accum_forget_gate1(),
+ _pixelwise_mul_forget_gate(),
+ _activation_forget_gate(),
+ _fully_connected_cell_state(),
+ _gemm_cell_state1(),
+ _transpose_cell_state(),
+ _accum_cell_state1(),
+ _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(),
+ _activation_cell_state(),
+ _cell_clip(),
+ _pixelwise_mul_cell_state2(),
+ _fully_connected_output(),
+ _pixelwise_mul_output_state1(),
+ _accum_output1(),
+ _activation_output(),
+ _activation_output_state(),
+ _pixelwise_mul_output_state2(),
+ _fully_connected_output_state(),
+ _projection_clip(),
+ _copy_cell_state(),
+ _copy_output(),
+ _concat_scratch_buffer(),
+ _concat_inputs_forget_gate(),
+ _concat_weights_forget_gate(),
+ _concat_weights_input_gate(),
+ _concat_weights_output(),
+ _mean_std_norm_input_gate(),
+ _pixelwise_mul_input_gate_coeff(),
+ _accum_input_gate_bias(),
+ _mean_std_norm_forget_gate(),
+ _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(),
+ _mean_std_norm_cell_gate(),
+ _pixelwise_mul_cell_gate_coeff(),
+ _accum_cell_gate_bias(),
+ _mean_std_norm_output_gate(),
+ _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(),
+ _input_gate_out1(),
+ _input_gate_out2(),
+ _input_gate_out3(),
+ _input_gate_out4(),
+ _forget_gate_out1(),
+ _forget_gate_out2(),
+ _forget_gate_out3(),
+ _forget_gate_out4(),
+ _forget_gate_out5(),
+ _forget_gate_out6(),
+ _cell_state_out1(),
+ _cell_state_out2(),
+ _cell_state_out3(),
+ _cell_state_out4(),
+ _cell_state_out5(),
+ _output1(),
+ _output2(),
+ _output3(),
+ _output4(),
+ _cell_state_activation(),
+ _output_state1(),
+ _ones(),
+ _input_layer_norm_out1(),
+ _input_layer_norm_out2(),
+ _forget_layer_norm_out1(),
+ _forget_layer_norm_out2(),
+ _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(),
+ _output_layer_norm_out1(),
+ _output_layer_norm_out2(),
+ _run_peephole_opt(false),
+ _run_cifg_opt(false),
+ _perform_cell_clipping(false),
+ _has_projection_weights(false),
+ _perform_projection_clipping(false),
+ _is_prepared(false),
_is_layer_norm_lstm(false)
{
}
-void NELSTMLayer::configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *output_state_in, const ITensor *cell_state_in,
- ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
- const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void NELSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *output_state_in,
+ const ITensor *cell_state_in,
+ ITensor *scratch_buffer,
+ ITensor *output_state_out,
+ ITensor *cell_state_out,
+ ITensor *output,
+ const LSTMParams<ITensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
- ARM_COMPUTE_LOG_PARAMS(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output,
- lstm_params, activation_info, cell_threshold, projection_threshold);
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
_is_layer_norm_lstm = lstm_params.use_layer_norm();
@@ -83,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input,
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
- input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- output_state_in->info(), cell_state_in->info(),
- scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
- lstm_params_info, activation_info, cell_threshold, projection_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+ cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
@@ -116,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input,
_concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6,
+ (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
Tensor *forget_gate_out = &_forget_gate_out5;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+ ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -138,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input,
{
_forget_gate_out3.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
_mean_std_norm_forget_gate.configure(forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(),
+ &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(forget_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -161,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input,
// input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
Tensor *input_gate_out = &_input_gate_out1;
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -183,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out4);
- _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2,
+ (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+ &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -201,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input,
_input_gate_out1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
_mean_std_norm_input_gate.configure(input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(),
+ &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(),
+ &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(input_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
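// --- Editorial sketch (not part of the patch): input gate with and without CIFG ---
// With the CIFG optimisation the input gate is not computed by its own fully connected
// layer; as the all-ones fill and _subtract_input_gate in run() suggest, it is derived as
// 1 - forget_gate. Without CIFG it is a second LOGISTIC gate analogous to the forget gate.
// A hedged reference only; names are illustrative.
#include <cmath>

static float input_gate_ref(float forget_gate, float pre_activation, bool has_cifg)
{
    return has_cifg ? (1.f - forget_gate)
                    : 1.f / (1.f + std::exp(-pre_activation));
}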
// Configure block that calculates the cell state
@@ -228,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input,
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias,
+ &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
_memory_group.manage(&_cell_state_out3);
@@ -237,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_state_out4);
_accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
Tensor *cell_state_out_ptr = &_cell_state_out4;
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
_mean_std_norm_cell_gate.configure(cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(),
+ &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
_activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
- if(cell_threshold != 0.f)
+ if (cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
+ _cell_clip.configure(&_cell_state_out1, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold));
}
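// --- Editorial sketch (not part of the patch): cell state update and clipping ---
// Per element the block above computes c_t = f_t * c_{t-1} + i_t * g_t, where g_t is the
// cell candidate passed through the user-supplied activation_info (TANH is assumed here for
// illustration), followed by the LU_BOUNDED_RELU clip to [-cell_threshold, cell_threshold]
// when cell_threshold != 0. Illustrative names only.
#include <algorithm>
#include <cmath>

static float cell_state_ref(float f_t, float c_prev, float i_t, float cell_pre, float cell_threshold)
{
    float c_t = f_t * c_prev + i_t * std::tanh(cell_pre);
    if (cell_threshold != 0.f)
    {
        c_t = std::min(std::max(c_t, -cell_threshold), cell_threshold);
    }
    return c_t;
}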
// Configure block that calculates the output
@@ -281,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias,
+ &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
Tensor *output_gate_out = &_output4;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -304,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input,
{
_output1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
_mean_std_norm_output_gate.configure(output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(),
+ &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(output_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -335,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_state_activation);
_activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_activation.allocator()->allocate();
output_gate_out->allocator()->allocate();
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
- if(projection_threshold != 0.f)
+ if (projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -projection_threshold, projection_threshold));
}
}
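// --- Editorial sketch (not part of the patch): output gate and output state ---
// o_t = sigmoid(W_o*x + U_o*h_prev [+ w_co*c_t] + b_o) and h_t = o_t * Activation(c_t),
// where Activation is the user-supplied activation_info. When projection weights are
// present, h_t additionally goes through a fully connected layer and, for a non-zero
// projection_threshold, a bounded-ReLU clip with +/-projection_threshold as bounds.
// Illustrative names only.
#include <cmath>

static float output_state_ref(float o_pre, float c_t_activated)
{
    const float o_t = 1.f / (1.f + std::exp(-o_pre)); // LOGISTIC output gate
    return o_t * c_t_activated;                       // PixelWiseMul -> h_t
}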
@@ -358,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input,
// Vector for holding the tensors to store in scratch buffer
std::vector<const ITensor *> scratch_inputs;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(input_gate_out);
}
@@ -372,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input,
output_gate_out->allocator()->allocate();
}
-Status NELSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status NELSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check dimensions
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -413,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
- && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+ cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
// If CIFG is used, input layer normalization weights tensor is omitted
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
}
@@ -434,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -445,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
}
// Check peephole optimization
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -465,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
- lstm_params.recurrent_to_input_weights(),
- lstm_params.input_gate_bias());
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -499,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
- TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, lstm_params.input_to_input_weights(),
+ (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+ &input_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(lstm_params.use_layer_norm())
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(cell_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (cell_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
- -cell_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&cell_state_tmp, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold)));
}
// Validate output gate tmp
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
- TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(),
+ &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
- if(projection_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out));
+ if (projection_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out,
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
}
}
@@ -590,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
// Validate scratch concatenation
std::vector<const ITensorInfo *> inputs_vector_info_raw;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
@@ -611,12 +775,12 @@ void NELSTMLayer::run()
_concat_inputs_forget_gate.run();
_fully_connected_forget_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
_pixelwise_mul_forget_gate_coeff.run();
@@ -624,15 +788,17 @@ void NELSTMLayer::run()
}
_activation_forget_gate.run();
- if(_run_cifg_opt)
+ if (_run_cifg_opt)
{
- if(_ones.info()->data_type() == DataType::F16)
+ if (_ones.info()->data_type() == DataType::F16)
{
- std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ std::fill_n(reinterpret_cast<half *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 1);
}
else
{
- std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ std::fill_n(reinterpret_cast<float *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 1);
}
_subtract_input_gate.run();
}
@@ -640,13 +806,13 @@ void NELSTMLayer::run()
{
_fully_connected_input_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
_pixelwise_mul_input_gate_coeff.run();
@@ -659,7 +825,7 @@ void NELSTMLayer::run()
_transpose_cell_state.run();
_gemm_cell_state1.run();
_accum_cell_state1.run();
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
_pixelwise_mul_cell_gate_coeff.run();
@@ -671,18 +837,18 @@ void NELSTMLayer::run()
_pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
- if(_perform_cell_clipping)
+ if (_perform_cell_clipping)
{
_cell_clip.run();
}
_fully_connected_output.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_output_state1.run();
_accum_output1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
_pixelwise_mul_output_gate_coeff.run();
@@ -693,10 +859,10 @@ void NELSTMLayer::run()
_activation_output_state.run();
_pixelwise_mul_output_state2.run();
- if(_has_projection_weights)
+ if (_has_projection_weights)
{
_fully_connected_output_state.run();
- if(_perform_projection_clipping)
+ if (_perform_projection_clipping)
{
_projection_clip.run();
}
@@ -710,10 +876,10 @@ void NELSTMLayer::run()
void NELSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_concat_weights_forget_gate.run();
- if(!_run_cifg_opt)
+ if (!_run_cifg_opt)
{
_concat_weights_input_gate.run();
}
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index cfdeb000e0..41f9c3d700 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,36 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit
NELSTMLayerQuantized::~NELSTMLayerQuantized() = default;
NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
- _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(),
- _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr),
- _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr),
- _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(),
- _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(),
- _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(),
+ : _memory_group(std::move(memory_manager)),
+ _gemmlowp(),
+ _output_stage(),
+ _transpose_weights(),
+ _concat_input_weights(),
+ _concat_recurrent_weights(),
+ _concat_weights(),
+ _concat_inputs(),
+ _concat_bias(),
+ _sigmoid_forget_gate(),
+ _sigmoid_input_gate(),
+ _sigmoid_output_gate(),
+ _tanh_modulation_gate(),
+ _tanh_output_state(),
+ _add1(),
+ _add2(),
+ _mul1(),
+ _mul2(),
+ _mul3(),
+ _slice_input_tensor(),
+ _slice_forget_tensor(),
+ _slice_cell_tensor(),
+ _slice_output_tensor(),
+ _dequantize(),
+ _quantize(),
+ _input_to_input_weights(nullptr),
+ _input_to_forget_weights(nullptr),
+ _input_to_cell_weights(nullptr),
+ _input_to_output_weights(nullptr),
+ _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr),
+ _recurrent_to_cell_weights(nullptr),
+ _recurrent_to_output_weights(nullptr),
+ _input_gate_bias(nullptr),
+ _forget_gate_bias(nullptr),
+ _cell_bias(nullptr),
+ _output_gate_bias(nullptr),
+ _recurrent_weights(),
+ _input_weights(),
+ _weights(),
+ _input(),
+ _weights_transposed(),
+ _output_highp(),
+ _output_lowp(),
+ _bias(),
+ _forget_gate_input(),
+ _input_gate_input(),
+ _output_gate_input(),
+ _input_modulation_gate_input(),
+ _forget_gate_output(),
+ _input_gate_output(),
+ _output_gate_output(),
+ _input_modulation_gate_output(),
+ _cell_state1(),
+ _cell_state2(),
+ _output_state_tmp(),
+ _output_state_out_symm(),
+ _output_state_out_f32(),
_is_prepared(false)
{
}
void NELSTMLayerQuantized::configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out)
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
-
- ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
- input_to_output_weights->info(),
- recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
-
- ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(
+ input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+ recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+ cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
const int input_size = input->info()->dimension(0);
const int batch_size = input->info()->dimension(1);
@@ -83,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
- auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
- auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+ auto_init_if_empty(*cell_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
_input_to_input_weights = input_to_input_weights;
_input_to_forget_weights = input_to_forget_weights;
@@ -100,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
_output_gate_bias = output_gate_bias;
// Weights concatenation
- std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights };
- std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights };
+ std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights};
+ std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights};
- _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _input_weights.allocator()->init(
+ TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
- _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _recurrent_weights.allocator()->init(
+ TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
- std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights };
- _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights};
+ _weights.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_weights.configure(weights_vector, &_weights, Window::DimX);
_transpose_weights.configure(&_weights, &_weights_transposed);
// Input concatenation
- std::vector<const ITensor *> input_vector{ input, output_state_in };
+ std::vector<const ITensor *> input_vector{input, output_state_in};
_memory_group.manage(&_input);
- _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _input.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
_concat_inputs.configure(input_vector, &_input, Window::DimX);
// Bias concatenation
- std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias };
+ std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias};
_bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
_concat_bias.configure(bias_vector, &_bias, Window::DimX);
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
// Run gemmlowp
_memory_group.manage(&_output_highp);
@@ -137,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
// Set the offset back
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
// multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
_output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
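// --- Editorial sketch (not part of the patch): the gemmlowp requantization multiplier ---
// Following the comment above, the fixed-point multiplier for the output stage is
// (input_scale * weights_scale) / output_scale, with the QSYMM16 output scale of 2^-12.
// A hedged, framework-free sketch of that arithmetic; the function name is illustrative.
static float gemmlowp_multiplier_ref(float input_scale, float weights_scale)
{
    const float output_scale = 1.f / 4096.f; // 2^-12, as stated in the comment above
    return (input_scale * weights_scale) / output_scale;
}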
@@ -159,64 +238,80 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
_bias.allocator()->allocate();
// Get the gate tensors
- if(batch_size > 1)
+ if (batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0},
+ {2 * output_size, batch_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0},
+ {4 * output_size, batch_size});
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+ {3 * output_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size});
_output_lowp.allocator()->allocate();
}
// Forget gate
_memory_group.manage(&_forget_gate_output);
- _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_output.allocator()->init(
+ TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
- _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_output.allocator()->init(
+ TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
// Input modulation gate equation
_memory_group.manage(&_input_modulation_gate_output);
- _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_output.allocator()->init(
+ TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
- _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_output.allocator()->init(
+ TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state1);
- _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state1.allocator()->init(
+ TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state2);
- _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state2.allocator()->init(
+ TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
@@ -226,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
// Short term memory
_memory_group.manage(&_output_state_tmp);
- _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _output_state_tmp.allocator()->init(
+ TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(cell_state_out, &_output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
- _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_state_out_symm.allocator()->init(
+ TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
- _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _output_state_out_f32.allocator()->init(
+ TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
_dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
@@ -246,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
}
Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
- output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
const int input_size = input->dimension(0);
const int batch_size = input->dimension(1);
@@ -266,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
- TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
- TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
- TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+ TensorInfo input_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(input_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(output_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(
+ input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QASYMM8)
+ .set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QSYMM16)
+ .set_quantization_info(qsymm_4));
// Shape checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
// Data type checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
// Quantization checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
@@ -310,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
- ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
// _concat_weights
std::vector<const ITensorInfo *> weights_vector;
@@ -320,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
// _transpose_weights
const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
- TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed));
// _concat_inputs
@@ -346,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
// _gemmlowp
const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
// Set the offset back
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -357,7 +494,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int32_t output_multiplier = 0;
int32_t output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// _output_stage
GEMMLowpOutputStageInfo info;
@@ -372,68 +510,91 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
TensorInfo input_modulation_gate_input;
TensorInfo output_gate_input;
- if(batch_size > 1)
+ if (batch_size > 1)
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
}
else
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
}
// _sigmoid_forget_gate
const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _sigmoid_input_gate
const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _tanh_modulation_gate
- const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+ qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _sigmoid_output_gate
const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&output_gate_input, &output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _mul_forget_gate_cell_state
const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
// _mul_input_gate_input_mod_gate
const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+ &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _add_cell_state_tmps
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
// _tanh_modulation_gate
const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, &output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _mul_output_state_tmp_output_gate
const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+ &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _dequantize
const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -442,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
// _quantize
ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out));
- if(cell_state_out->total_size() != 0)
+ if (cell_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
}
- if(output_state_out->total_size() != 0)
+ if (output_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -508,7 +669,7 @@ void NELSTMLayerQuantized::run()
void NELSTMLayerQuantized::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_input_weights.allocator()->allocate();
_concat_input_weights.run();
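Taken together, the NELSTMLayerQuantized hunks above show the signature style the whole patch converges on: once a declaration exceeds the column limit, parameters are no longer bin-packed but placed one per line, aligned under the first one, and wrapped macro calls such as ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR continue on the following lines at the same depth. This is consistent with disabling parameter bin-packing in the revised configuration, which is not part of this delivery, so treat the option names as an inference. A minimal standalone sketch; the function and the TensorStub type are invented for illustration and are not part of the library:

    struct TensorStub   // illustration-only stand-in for ITensorInfo
    {
    };

    // One parameter per line once the signature no longer fits, each aligned
    // under the first parameter rather than re-packed onto fewer lines.
    bool validate_gate(const TensorStub *input,
                       const TensorStub *input_weights,
                       const TensorStub *recurrent_weights,
                       const TensorStub *bias,
                       TensorStub       *state_out)
    {
        // Expressions that overflow the limit wrap their operands the same way.
        return (input != nullptr) && (input_weights != nullptr) && (recurrent_weights != nullptr) &&
               (bias != nullptr) && (state_out != nullptr);
    }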
diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp
index 92dcf15791..0013a521d1 100644
--- a/src/runtime/NEON/functions/NELogical.cpp
+++ b/src/runtime/NEON/functions/NELogical.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NELogicalKernel.h"
@@ -32,15 +33,14 @@ namespace arm_compute
{
struct LogicalArgs
{
- std::unique_ptr<kernels::NELogicalKernel> kernel{ nullptr };
+ std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr};
ITensorPack pack{};
};
struct NELogicalAnd::Impl : public LogicalArgs
{
};
-NELogicalAnd::NELogicalAnd()
- : _impl(std::make_unique<Impl>())
+NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>())
{
}
NELogicalAnd::~NELogicalAnd() = default;
@@ -72,8 +72,7 @@ void NELogicalAnd::run()
struct NELogicalOr::Impl : public LogicalArgs
{
};
-NELogicalOr::NELogicalOr()
- : _impl(std::make_unique<Impl>())
+NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>())
{
}
NELogicalOr::~NELogicalOr() = default;
@@ -105,8 +104,7 @@ void NELogicalOr::run()
struct NELogicalNot::Impl : public LogicalArgs
{
};
-NELogicalNot::NELogicalNot()
- : _impl(std::make_unique<Impl>())
+NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>())
{
}
NELogicalNot::~NELogicalNot() = default;
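Two smaller conventions recur from the NELogical hunks onwards: default member initialisers drop the inner padding ({nullptr} instead of { nullptr }), and a constructor whose initialiser list fits within the limit keeps it on the declaration line. A compilable sketch with made-up names:

    #include <memory>

    struct KernelStub   // illustration-only placeholder type
    {
    };

    struct ImplStub
    {
        // Braced default initialisers are written without inner spaces.
        std::unique_ptr<KernelStub> kernel{nullptr};
        int                         flags{0};
    };

    struct FunctionStub
    {
        FunctionStub() : _impl(std::make_unique<ImplStub>())   // short init-list stays on one line
        {
        }

        std::unique_ptr<ImplStub> _impl;
    };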
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
index 58640f40ea..31898bafc4 100644
--- a/src/runtime/NEON/functions/NEMatMul.cpp
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuMatMul.h"
@@ -33,23 +34,27 @@ namespace arm_compute
{
struct NEMatMul::Impl
{
- const ITensor *lhs{ nullptr };
- const ITensor *rhs{ nullptr };
- ITensor *output{ nullptr };
- std::unique_ptr<cpu::CpuMatMul> op{ nullptr };
+ const ITensor *lhs{nullptr};
+ const ITensor *rhs{nullptr};
+ ITensor *output{nullptr};
+ std::unique_ptr<cpu::CpuMatMul> op{nullptr};
MemoryGroup memory_group{};
WorkspaceData<Tensor> workspace_tensors{};
ITensorPack run_pack{};
};
-NEMatMul::NEMatMul()
- : _impl(std::make_unique<Impl>())
+NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>())
{
}
NEMatMul::~NEMatMul() = default;
-void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void NEMatMul::configure(ITensor *lhs,
+ ITensor *rhs,
+ ITensor *output,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
_impl->lhs = lhs;
_impl->rhs = rhs;
@@ -58,11 +63,16 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM
ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
_impl->op = std::make_unique<cpu::CpuMatMul>();
_impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
- _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
+ _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+Status NEMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
}
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
index 97ddaea41d..c3861afd2c 100644
--- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
#include "src/cpu/operators/CpuMaxUnpooling.h"
@@ -35,20 +36,22 @@ namespace arm_compute
{
struct NEMaxUnpoolingLayer::Impl
{
- const ITensor *src{ nullptr };
- const ITensor *indices{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuMaxUnpooling> op{ nullptr };
+ const ITensor *src{nullptr};
+ const ITensor *indices{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr};
};
NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default;
-NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
- : _fill_func(), _impl()
+NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl()
{
}
-void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
+void NEMaxUnpoolingLayer::configure(ITensor *input,
+ ITensor *indices,
+ ITensor *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
@@ -64,7 +67,10 @@ void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *o
_impl->op->configure(input->info(), indices->info(), output->info(), pool_info);
}
-Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info));
diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
index 7626aa0db2..dec0dde56d 100644
--- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
-#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index d3b1696335..d6d2e9dc46 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
@@ -61,13 +62,16 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons
_input_squared.allocator()->allocate();
}
-Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status NENormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
return Status{};
}
@@ -78,4 +82,4 @@ void NENormalizationLayer::run()
_multiply_f.run();
NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
}
-} \ No newline at end of file
+} // namespace arm_compute
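The tail of the NENormalizationLayer.cpp diff above fixes a file that previously ended without a trailing newline; the reformatted version also annotates the closing brace of the namespace with its name. In sketch form (the function body is a placeholder, not library code):

    namespace arm_compute
    {
    void placeholder()   // any translation-unit content
    {
    }
    } // namespace arm_compute
    // (file ends with a newline)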
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index 80c5690a4e..963e68bac7 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/operators/CpuPRelu.h"
namespace arm_compute
@@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu;
struct NEPReluLayer::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
};
-NEPReluLayer::NEPReluLayer()
- : _impl(std::make_unique<Impl>())
+NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>())
{
}
-NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
+NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default;
NEPReluLayer::~NEPReluLayer() = default;
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 8bacdd3002..253566df0f 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -23,13 +23,13 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NEPadLayerKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
namespace arm_compute
{
@@ -38,9 +38,9 @@ namespace
uint32_t last_padding_dimension(const PaddingList &padding)
{
int last_padding_dim = padding.size() - 1;
- for(; last_padding_dim >= 0; --last_padding_dim)
+ for (; last_padding_dim >= 0; --last_padding_dim)
{
- if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+ if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
{
break;
}
@@ -52,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding)
NEPadLayer::~NEPadLayer() = default;
NEPadLayer::NEPadLayer()
- : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+ : _copy_function(),
+ _pad_kernel(),
+ _mode(),
+ _padding(),
+ _num_dimensions(0),
+ _slice_functions(),
+ _concat_functions(),
+ _slice_results(),
+ _concat_results()
{
}
-void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value)
{
_pad_kernel = std::make_unique<NEPadLayerKernel>();
_pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
@@ -85,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
Coordinates ends_after{};
Coordinates strides{};
ITensor *prev = input;
- for(uint32_t i = 0; i < _num_dimensions; ++i)
+ for (uint32_t i = 0; i < _num_dimensions; ++i)
{
// Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
- if(i > 0)
+ if (i > 0)
{
strides.set(i - 1, 1);
}
- if(_padding[i].first > 0 || _padding[i].second > 0)
+ if (_padding[i].first > 0 || _padding[i].second > 0)
{
// Set the starts, ends, and strides values for the current dimension.
// Due to the bit masks passed to strided slice, the values below the current dimension in
// starts and ends will be ignored so do not need to be modified.
- if(_mode == PaddingMode::REFLECT)
+ if (_mode == PaddingMode::REFLECT)
{
starts_before.set(i, _padding[i].first);
ends_before.set(i, 0);
@@ -124,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
// Reflect the input values for the padding before and after the input.
std::vector<const ITensor *> concat_vector;
- if(_padding[i].first > 0)
+ if (_padding[i].first > 0)
{
- if(i < prev->info()->num_dimensions())
+ if (i < prev->info()->num_dimensions())
{
- _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides,
+ begin_mask_before, end_mask_before);
concat_vector.emplace_back(&_slice_results[2 * i]);
}
else
@@ -138,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
}
concat_vector.push_back(prev);
- if(_padding[i].second > 0)
+ if (_padding[i].second > 0)
{
- if(i < prev->info()->num_dimensions())
+ if (i < prev->info()->num_dimensions())
{
- _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after,
+ strides, begin_mask_after, end_mask_after);
concat_vector.emplace_back(&_slice_results[2 * i + 1]);
}
else
@@ -154,12 +167,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
// Concatenate the padding before and after with the input.
ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
out->info()->set_quantization_info(output->info()->quantization_info());
- for(auto &v : concat_vector)
+ for (auto &v : concat_vector)
{
v->info()->set_quantization_info(input->info()->quantization_info());
}
_concat_functions[i].configure(concat_vector, out, i);
- if(i != _num_dimensions - 1)
+ if (i != _num_dimensions - 1)
{
_concat_results[i].allocator()->allocate();
}
@@ -170,7 +183,11 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
}
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayer::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
@@ -178,15 +195,16 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p
_padding = padding;
_mode = mode;
- const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+ const TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
// Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
_num_dimensions = last_padding_dimension(padding) + 1;
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
- switch(_mode)
+ switch (_mode)
{
case PaddingMode::CONSTANT:
{
@@ -210,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p
}
}
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
@@ -231,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
{
- for(uint32_t i = 0; i < padding.size(); ++i)
+ for (uint32_t i = 0; i < padding.size(); ++i)
{
- if(mode == PaddingMode::REFLECT)
+ if (mode == PaddingMode::REFLECT)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
@@ -256,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
void NEPadLayer::run()
{
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
- switch(_mode)
+ switch (_mode)
{
case PaddingMode::CONSTANT:
{
@@ -268,15 +290,15 @@ void NEPadLayer::run()
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
{
- for(uint32_t i = 0; i < _num_dimensions; ++i)
+ for (uint32_t i = 0; i < _num_dimensions; ++i)
{
- if(_padding[i].first > 0 || _padding[i].second > 0)
+ if (_padding[i].first > 0 || _padding[i].second > 0)
{
- if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
{
_slice_functions[2 * i].run();
}
- if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
{
_slice_functions[2 * i + 1].run();
}
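Most of the churn in NEPadLayer.cpp above is the single space inserted after control-flow keywords, giving if (...), for (...) and switch (...) instead of if(...), for(...) and switch(...), while ordinary calls keep no space before the parenthesis. A self-contained sketch adapted from the last_padding_dimension helper touched above; the PaddingList alias here is a local stand-in for the library type:

    #include <cstdint>
    #include <utility>
    #include <vector>

    using PaddingList = std::vector<std::pair<uint32_t, uint32_t>>;   // stand-in alias

    // Index of the last dimension that actually carries padding, or -1 if none does.
    int last_padded_dimension(const PaddingList &padding)
    {
        int last = static_cast<int>(padding.size()) - 1;
        for (; last >= 0; --last)                                      // space after 'for'
        {
            if (padding[last].first > 0 || padding[last].second > 0)   // space after 'if'
            {
                break;
            }
        }
        return last;
    }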
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index 517b86a1cb..80cd04ce6c 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -24,19 +24,19 @@
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuPermute.h"
namespace arm_compute
{
struct NEPermute::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuPermute> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuPermute> op{nullptr};
};
-NEPermute::NEPermute()
- : _impl(std::make_unique<Impl>())
+NEPermute::NEPermute() : _impl(std::make_unique<Impl>())
{
}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index ad83a26beb..97155a9e74 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/core/ITensor.h"
+
#include "src/cpu/operators/CpuMul.h"
#include <utility>
@@ -32,32 +33,42 @@ namespace arm_compute
{
struct NEPixelWiseMultiplication::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuMul> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuMul> op{nullptr};
};
-NEPixelWiseMultiplication::NEPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
-Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void NEPixelWiseMultiplication::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuMul>();
- _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+ _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy,
+ act_info);
}
void NEPixelWiseMultiplication::run()
@@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run()
struct NEComplexPixelWiseMultiplication::Impl
{
- ITensor *src_0{ nullptr };
- ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuComplexMul> op{ nullptr };
+ ITensor *src_0{nullptr};
+ ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuComplexMul> op{nullptr};
};
-NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
-Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuComplexMul::validate(input1, input2, output, act_info);
}
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
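The NEPixelWiseMultiplication.cpp hunk above also shows how an over-long call is handled: the trailing arguments are wrapped onto a continuation line aligned just after the opening parenthesis. A small illustration with invented names and values:

    // Illustration-only free function with a deliberately wide parameter list.
    int configure_stage(const char *name, int scale, int overflow_policy, int rounding_policy, int activation)
    {
        return (name != nullptr ? 1 : 0) + scale + overflow_policy + rounding_policy + activation;
    }

    int caller()
    {
        // Arguments that no longer fit are continued on the next line, aligned
        // with the first argument after the opening parenthesis.
        return configure_stage("pixelwise_multiplication_with_a_long_configuration_name", /*scale*/ 1,
                               /*overflow_policy*/ 0, /*rounding_policy*/ 2, /*activation*/ 0);
    }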
diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
index 53f9dbf0a2..e017e8c21d 100644
--- a/src/runtime/NEON/functions/NEPooling3dLayer.cpp
+++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuPool3d.h"
@@ -33,9 +34,9 @@ namespace arm_compute
{
struct NEPooling3dLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuPool3d> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuPool3d> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<Tensor> workspace_tensors{};
@@ -43,8 +44,7 @@ struct NEPooling3dLayer::Impl
NEPooling3dLayer::~NEPooling3dLayer() = default;
-NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
}
@@ -56,11 +56,12 @@ void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Po
_impl->op = std::make_unique<cpu::CpuPool3d>();
_impl->op->configure(input->info(), output->info(), pool_info);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+Status
+NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
{
return cpu::CpuPool3d::validate(input, output, pool_info);
}
@@ -72,4 +73,4 @@ void NEPooling3dLayer::run()
_impl->op->run(_impl->run_pack);
}
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
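The NEPooling3dLayer.cpp hunk above shows the fallback when wrapping alone would not bring a declaration under the limit: the return type is broken onto its own line and the declarator keeps its parameters together. Sketch with placeholder types, not library code:

    struct StatusStub   // illustration-only stand-in for arm_compute::Status
    {
    };

    // Return type on its own line; the parameter list itself still fits on one line.
    StatusStub
    validate_pooling_like_operation(const char *input_name, const char *output_name, int pool_size, int stride)
    {
        (void)input_name;
        (void)output_name;
        (void)pool_size;
        (void)stride;
        return StatusStub{};
    }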
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 5a3b9c5e7e..eb9125be3c 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/operators/CpuPool2d.h"
@@ -33,10 +34,10 @@ namespace arm_compute
{
struct NEPoolingLayer::Impl
{
- ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- ITensor *indices{ nullptr };
- std::unique_ptr<cpu::CpuPool2d> op{ nullptr };
+ ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ ITensor *indices{nullptr};
+ std::unique_ptr<cpu::CpuPool2d> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<Tensor> workspace_tensors{};
@@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl
NEPoolingLayer::~NEPoolingLayer() = default;
-NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
}
@@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
_impl->op = std::make_unique<cpu::CpuPool2d>();
_impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src},
+ {TensorType::ACL_DST_0, _impl->dst},
+ {TensorType::ACL_DST_1, _impl->indices}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status NEPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
return cpu::CpuPool2d::validate(input, output, pool_info, indices);
}
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index aba09239cf..dbb6bf9df1 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -27,15 +27,19 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
namespace arm_compute
{
-void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayer::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
@@ -44,7 +48,10 @@ void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, IT
_kernel = std::move(k);
}
-Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayer::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
return NEPriorBoxLayerKernel::validate(input1, input2, output, info);
}
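Finally, the header shuffles in NEPriorBoxLayer.cpp above and NEQLSTMLayer.cpp below follow one include-ordering rule: within a group, paths appear to be sorted case-insensitively (utils/... before Validate.h), and a blank line separates the public arm_compute/ headers from the internal src/ ones. The grouping itself is driven by the clang-format configuration, which is not part of this delivery, so treat the exact rule as an inference. The headers below are quoted from the hunks themselves:

    // Public API group, sorted case-insensitively within the group ...
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "arm_compute/core/Validate.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    // ... blank line, then the internal src/ group.
    #include "src/common/utils/Log.h"
    #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"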
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 2caaea02d8..dd78d10d16 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -27,13 +27,14 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
namespace arm_compute
@@ -41,12 +42,19 @@ namespace arm_compute
using namespace arm_compute::utils::info_helpers;
namespace
{
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
- float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensorInfo *mm_input,
+ const ITensorInfo *mm_weights,
+ const ITensorInfo *bias,
+ float gemmlowp_scale,
+ const TensorInfo *mm_res_info,
+ const TensorInfo *outstage_tensor_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
return Status{};
}
} // namespace
@@ -55,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf
{
// Output quantization scale will be different, but ignored here
// since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
+ const TensorInfo out{in};
return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}
@@ -98,14 +103,12 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst)
void NEQLSTMLayer::TensorCopyKernel::run()
{
- Iterator input_iter{ _src, _window };
- Iterator output_iter{ _dst, _window };
+ Iterator input_iter{_src, _window};
+ Iterator output_iter{_dst, _window};
- execute_window_loop(_window, [&](const Coordinates &)
- {
- memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
- },
- input_iter, output_iter);
+ execute_window_loop(
+ _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+ output_iter);
}
NEQLSTMLayer::~NEQLSTMLayer() = default;
@@ -191,10 +194,17 @@ NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
_memory_group = MemoryGroup(std::move(memory_manager));
}
-void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias,
- Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+ NEGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensor *mm_input,
+ const ITensor *mm_weights,
+ const ITensor *bias,
+ Tensor *mm_res,
+ Tensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info)
{
_memory_group.manage(mm_res);
_memory_group.manage(outstage_res);
@@ -206,66 +216,87 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp
mm.configure(mm_input, mm_weights, nullptr, mm_res);
// Configure output stage
- quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);
mm_res->allocator()->allocate();
}
-void NEQLSTMLayer::configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *cell_state_in, ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+void NEQLSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *cell_state_in,
+ ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out,
+ ITensor *output,
const LSTMParams<ITensor> &lstm_params)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
// Set lstm parameters
LSTMParams<ITensorInfo> lstm_params_info{};
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
- _input_to_forget_weights_transposed.info()->set_quantization_info(input_to_forget_weights->info()->quantization_info());
+ _input_to_forget_weights_transposed.info()->set_quantization_info(
+ input_to_forget_weights->info()->quantization_info());
_input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info());
- _input_to_output_weights_transposed.info()->set_quantization_info(input_to_output_weights->info()->quantization_info());
- _recurrent_to_forget_weights_transposed.info()->set_quantization_info(recurrent_to_forget_weights->info()->quantization_info());
- _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info());
- _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info());
-
- if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ _input_to_output_weights_transposed.info()->set_quantization_info(
+ input_to_output_weights->info()->quantization_info());
+ _recurrent_to_forget_weights_transposed.info()->set_quantization_info(
+ recurrent_to_forget_weights->info()->quantization_info());
+ _recurrent_to_cell_weights_transposed.info()->set_quantization_info(
+ recurrent_to_cell_weights->info()->quantization_info());
+ _recurrent_to_output_weights_transposed.info()->set_quantization_info(
+ recurrent_to_output_weights->info()->quantization_info());
+
+ if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
{
_convert_input_to_forget_weights_to_qsymm8 = true;
// Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32
- _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
- .set_data_layout(input_to_forget_weights->info()->data_layout()));
+ _input_to_forget_weights_f32.allocator()->init(
+ TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
+ .set_data_layout(input_to_forget_weights->info()->data_layout()));
// Setup the quantize output tensor to go from F32 -> QSYMM8
- _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
- .set_data_layout(input_to_forget_weights->info()->data_layout())
- .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
+ _input_to_forget_weights_symm8.allocator()->init(
+ (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
+ .set_data_layout(input_to_forget_weights->info()->data_layout())
+ .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
_dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32);
_quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8);
- ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+ input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+ recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+ output->info(), lstm_params_info));
}
else
{
- ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+ recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+ output->info(), lstm_params_info));
}
const int batch_size = input->info()->dimension(1);
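
The configure_mm() helper above hands each floating-point rescale factor to quantization::calculate_quantized_multiplier, which turns it into the integer multiplier/shift pair that the GEMMLowp output stage consumes. A minimal sketch of that kind of decomposition, assuming a Q0.31 convention for illustration only (the helper name and layout below are not the library's routine):

    #include <cmath>
    #include <cstdint>

    // Illustration only (not the library routine): find an int32 multiplier and a
    // power-of-two shift such that scale ~= multiplier * 2^(-31) * 2^(shift).
    inline void decompose_scale_sketch(float scale, int32_t &multiplier, int32_t &shift)
    {
        int     exponent = 0;
        double  mantissa = std::frexp(static_cast<double>(scale), &exponent); // mantissa in [0.5, 1)
        int64_t q        = std::llround(mantissa * (1LL << 31));
        if (q == (1LL << 31)) // mantissa rounded up to 1.0: renormalise
        {
            q /= 2;
            ++exponent;
        }
        multiplier = static_cast<int32_t>(q);
        shift      = exponent; // positive: shift left, negative: shift right
    }

The int32 accumulator can then be rescaled with integer-only arithmetic, which is why the configure path converts every scale up front rather than at run time.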
@@ -277,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input,
const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
_projection_bias = lstm_params.projection_bias();
- _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights;
+ _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ ? &_input_to_forget_weights_symm8
+ : input_to_forget_weights;
_input_to_cell_weights = input_to_cell_weights;
_input_to_output_weights = input_to_output_weights;
_recurrent_to_forget_weights = recurrent_to_forget_weights;
@@ -287,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Layer normalization
_has_layer_norm = lstm_params.use_layer_norm();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -309,22 +342,25 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
_has_cell_clipping = quantized_cell_clip > 0;
// Precompute effective bias for optimizing the matmul computations.
- if(!_has_cifg)
+ if (!_has_cifg)
{
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
_input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
_recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
- _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(
+ _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
_input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
@@ -334,19 +370,31 @@ void NEQLSTMLayer::configure(const ITensor *input,
_input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
_recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
- _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- if(_has_projection)
+ _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(
+ recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(
+ recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(
+ recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if (_has_projection)
{
_projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
- _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
- if(_projection_bias != nullptr)
+ _projection_reduction->configure(
+ _projection_weights->info(), _projection_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ if (_projection_bias != nullptr)
{
- _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+ _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias,
+ ConvertPolicy::SATURATE);
}
}
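
The CpuGemmLowpMatrixAReductionKernel instances configured above precompute an "effective bias" that folds the asymmetric zero-points into the bias term: with an input zero-point z_x, the accumulation sum_i w[j][i] * (x[i] - z_x) splits into the plain integer GEMM sum_i w[j][i] * x[i] plus a per-row constant -z_x * sum_i w[j][i] that depends only on the weights. A small sketch of that per-row constant, assuming row-major int8 weights (illustration only, not the kernel):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustration only: the per-output-row constant -z_x * sum_i w[j][i] can be
    // folded into the bias once, so the runtime GEMM ignores the input zero-point.
    inline std::vector<int32_t> effective_bias_sketch(const std::vector<std::vector<int8_t>> &weights,
                                                      int32_t                                 input_zero_point)
    {
        std::vector<int32_t> bias(weights.size(), 0);
        for (std::size_t row = 0; row < weights.size(); ++row)
        {
            int32_t row_sum = 0;
            for (int8_t w : weights[row])
            {
                row_sum += w;
            }
            bias[row] = -input_zero_point * row_sum;
        }
        return bias;
    }

This matches the GEMMLowpReductionKernelInfo arguments used above, where the scalar applied to the row sums is the negated input (or output-state) offset.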
@@ -354,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input,
_transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);
_transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);
_transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);
- _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights,
+ &_recurrent_to_forget_weights_transposed);
_transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
- _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
- if(!_has_cifg)
+ _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights,
+ &_recurrent_to_output_weights_transposed);
+ if (!_has_cifg)
{
- _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
- _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(),
+ &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(),
+ &_recurrent_to_input_weights_transposed);
}
- if(_has_projection)
+ if (_has_projection)
{
_transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);
}
@@ -375,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input,
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
// Forget gate.
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
- const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
- input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
- &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
- &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+ &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+ &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res,
+ &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
_input_to_forget_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_forget_res);
- _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_forget_outstage_res);
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ const float cell_to_forget_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res,
+ gemmlowp_info);
_mul_cell_to_forget_res.allocator()->allocate();
- _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
_cell_to_forget_outstage_res.allocator()->allocate();
}
Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Forget, forget_activation_input);
forget_activation_input->allocator()->allocate();
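
Each gate's rescale factor above follows the same pattern: the S32 accumulator produced by the matmul is expressed in units of weight_scale * input_scale, and the output stage requantizes it into the gate's QSYMM16 intermediate scale, so the factor passed to calculate_quantized_multiplier is simply their ratio. Restated as a tiny helper for clarity (illustrative naming only):

    // Illustration of the rescale factor used for every gate matmul:
    // accumulator units (weight_scale * input_scale) -> gate intermediate scale.
    inline float gate_rescale_sketch(float weight_scale, float input_scale, float intermediate_scale)
    {
        return weight_scale * input_scale / intermediate_scale;
    }

The forget, cell, input and output gates below all compute their input_to_* and recurrent_to_* scales this way, differing only in which intermediate scale they target.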
@@ -417,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Output quantization info of Sigmoid and Tanh activations
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
- const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_forget_gate);
_forget_gate.allocator()->init(forget_gate_info);
- _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
forget_activation_input->allocator()->allocate();
// Modulation gate.
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
- input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
- &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed,
+ &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
mm_out_info, cell_outstage_info);
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
- &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
- mm_out_info, cell_outstage_info);
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+ &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
- _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
_input_to_cell_outstage_res.allocator()->allocate();
Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Cell, cell_activation_input);
cell_activation_input->allocator()->allocate();
@@ -454,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_gate);
_cell_gate.allocator()->init(cell_gate_info);
- _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _cell_gate_tanh.configure(cell_activation_input, &_cell_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
cell_activation_input->allocator()->allocate();
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_input_gate.allocator()->init(input_gate_info);
_memory_group.manage(&_input_gate);
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.allocator()->init(*_forget_gate.info());
_input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -469,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input,
}
else
{
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
- input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
- &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
- mm_out_info, input_outstage_info);
-
- const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+ &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+ &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale =
+ _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
mm_out_info, input_outstage_info);
- _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_input_to_input_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
- _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _mul_cell_to_input_res.allocator()->init(
+ TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_input_res);
- _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(),
+ &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_input_outstage_res);
- _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res,
+ gemmlowp_info);
_mul_cell_to_input_res.allocator()->allocate();
- _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_cell_to_input_outstage_res.allocator()->allocate();
}
Tensor *input_activation_input = &_recurrent_to_input_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Input, input_activation_input);
input_activation_input->allocator()->allocate();
input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
}
- _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_sigmoid.configure(input_activation_input, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
input_activation_input->allocator()->allocate();
}
// Cell.
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
- _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
- const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(mul_input_cell_scale, 0));
_memory_group.manage(&_mul_input_cell_res);
_mul_input_cell_res.allocator()->init(mul_input_cell_info);
- _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_cell_gate.allocator()->allocate();
_add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
_mul_input_cell_res.allocator()->allocate();
_forget_gate.allocator()->allocate();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
- _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ _cell_clip.configure(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip));
}
// Output gate.
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
- input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
- &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
- mm_out_info, output_outstage_info);
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
- &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
- mm_out_info, output_outstage_info);
-
- _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+ &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+ &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res,
+ &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info);
+
+ _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res,
+ &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
_input_to_output_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
- _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
- const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(),
+ &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_output_outstage_res);
- _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res,
+ gemmlowp_info);
_mul_cell_to_output_res.allocator()->allocate();
- _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res,
+ &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
_cell_to_output_outstage_res.allocator()->allocate();
}
Tensor *output_activation_input = &_recurrent_to_output_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Output, output_activation_input);
output_activation_input->allocator()->allocate();
@@ -576,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_output_gate);
_output_gate.allocator()->init(output_gate_info);
- _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_sigmoid.configure(output_activation_input, &_output_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
output_activation_input->allocator()->allocate();
// Hidden.
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _hidden_tanh.configure(cell_state_out, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
- _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate.allocator()->allocate();
_input_gate.allocator()->allocate();
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = output_state_in->info()->data_type();
@@ -598,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_hidden_gate);
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->init(*output_state_out->info());
_hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -609,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input,
_hidden_mul_res.allocator()->allocate();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
const TensorInfo projection_outstage_info(*output_state_out->info());
- const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
- gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
- gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
- gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
- TensorInfo projection_mm_out_info{ mm_out_info };
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
- hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
- &_mm_projection_res, &_projection_outstage_res, projection_scale,
- projection_mm_out_info, projection_outstage_info);
+ configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+ &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+ &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
ITensor *accumulate_destination = output_state_out;
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->allocate();
_projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -638,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input,
accumulate_destination = &_projection_accumulate_res;
}
- _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination,
+ ConvertPolicy::SATURATE);
_projection_outstage_res.allocator()->allocate();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
_projection_accumulate_res.allocator()->allocate();
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
- quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ quantized_projection_clip =
+ utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
+ _projection_clip.configure(output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip));
_has_projection_clipping = true;
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
_hidden_gate.allocator()->allocate();
@@ -672,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input,
_copy_output.configure(output_state_out, output);
}
-Status NEQLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status NEQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
- cell_state_out, output_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -694,22 +812,27 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+ input_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8);
// If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED
if (input_to_forget_weights->data_type() == DataType::QSYMM8)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
}
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -728,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
// Check whether peephole weights are all there or none
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
}
}
@@ -755,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
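
quantize_qsymm16 maps the floating-point cell clip into the cell state's symmetric 16-bit domain; conceptually q = clamp(round(value / scale), -32768, 32767), since QSYMM16 has a zero offset. A hedged sketch of that formula (illustration only, not the exact library call or its signature):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustration only: symmetric 16-bit quantization with zero offset.
    inline int16_t quantize_symm16_sketch(float value, float scale)
    {
        const long q = std::lround(value / scale);
        return static_cast<int16_t>(std::max(-32768L, std::min(32767L, q)));
    }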
@@ -763,60 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Precompute effective bias for optimizing the matmul computations.
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
- -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.input_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
- -qoutput_state_in.offset, true)));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_forget_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_cell_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_output_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
- if(lstm_params.projection_bias() != nullptr)
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.projection_weights(), &projection_eff_bias_info,
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+ if (lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+ &projection_eff_bias_info, ConvertPolicy::SATURATE));
}
}
- const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), input_to_cell_weights->quantization_info());
- const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, input_to_output_weights->data_type(), input_to_output_weights->quantization_info());
- const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
- const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_cell_weights->data_type(), recurrent_to_cell_weights->quantization_info());
- const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_output_weights->data_type(), recurrent_to_output_weights->quantization_info());
- const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(),
+ input_to_cell_weights->quantization_info());
+ const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1,
+ input_to_output_weights->data_type(),
+ input_to_output_weights->quantization_info());
+ const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_cell_weights->data_type(),
+ recurrent_to_cell_weights->quantization_info());
+ const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_output_weights->data_type(),
+ recurrent_to_output_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed));
- if(!lstm_params.has_cifg_opt())
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed));
+ if (!lstm_params.has_cifg_opt())
{
- const TensorInfo recurrent_to_input_weights_transposed(TensorShape(num_units, output_size), 1,
- recurrent_to_forget_weights->data_type(), lstm_params.recurrent_to_input_weights()->quantization_info());
+ const TensorInfo recurrent_to_input_weights_transposed(
+ TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(),
+ lstm_params.recurrent_to_input_weights()->quantization_info());
const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1,
- lstm_params.input_to_input_weights()->data_type(), lstm_params.input_to_input_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed));
+ lstm_params.input_to_input_weights()->data_type(),
+ lstm_params.input_to_input_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed));
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
}
GEMMLowpOutputStageInfo gemmlowp_info;
@@ -829,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Forget gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
- const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
- const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_forget_scale, &mm_out_info, &forget_outstage_info));
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+ &forget_outstage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
const ITensorInfo *b_info = forget_gate_bias;
@@ -859,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Output quantization info of Sigmoid and Tanh activations
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
- const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Modulation gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
- const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
- if(has_layer_norm)
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+ &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+ &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
const ITensorInfo *b_info = cell_bias;
@@ -882,94 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+ "Input gate bias must not be present when CIFG is used");
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+ &forget_gate_info, ConvertPolicy::SATURATE));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
// If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED
if (input_to_forget_weights->data_type() == DataType::QSYMM8)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights());
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights,
+ lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights());
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+ lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
- const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
- if(lstm_params.has_peephole_opt())
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale =
+ lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+ &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
const ITensorInfo *b_info = lstm_params.input_gate_bias();
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&input_outstage_info, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
- if(quantized_cell_clip > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if (quantized_cell_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
- quantized_cell_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip)));
}
// Output gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+ &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+ DataType::QSYMM16);
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
const ITensorInfo *b_info = output_gate_bias;
@@ -977,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&output_outstage_info, &output_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = hidden_out_info.data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
const bool projection_tensor_copy_required = num_units != output_size;
// Projection.
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+ lstm_params.projection_weights());
ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
- const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
const TensorInfo projection_outstage_info(*output_state_out);
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
- TensorInfo projection_mm_out_info{ mm_out_info };
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+ &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
&projection_outstage_info));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+ ConvertPolicy::SATURATE));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip)));
}
}
else
{
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
}
}
- if(cell_state_out->total_size() > 0)
+ if (cell_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
}
- if(output_state_out->total_size() > 0)
+ if (output_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -1080,14 +1319,14 @@ void NEQLSTMLayer::run()
_recurrent_to_forget_outstage.run();
_accumulate_input_recurrent_forget.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
}
@@ -1102,7 +1341,7 @@ void NEQLSTMLayer::run()
_recurrent_to_cell_outstage.run();
_accumulate_input_recurrent_modulation.run();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
}
@@ -1110,7 +1349,7 @@ void NEQLSTMLayer::run()
_cell_gate_tanh.run();
// Input gate
- if(_has_cifg)
+ if (_has_cifg)
{
_input_gate_sub.run();
}
@@ -1122,14 +1361,14 @@ void NEQLSTMLayer::run()
_recurrent_to_input_outstage.run();
_accumulate_input_recurrent_input.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
}
@@ -1142,7 +1381,7 @@ void NEQLSTMLayer::run()
_pixelwise_mul_input_cell.run();
_add_forget_cell.run();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
_cell_clip.run();
}
@@ -1153,14 +1392,14 @@ void NEQLSTMLayer::run()
_mm_recurrent_to_output.run();
_recurrent_to_output_outstage.run();
_accumulate_input_recurrent_output.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
}
@@ -1173,31 +1412,31 @@ void NEQLSTMLayer::run()
_hidden_outstage.run();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
_mm_projection.run();
_projection_outstage.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_output_to_accumulate_copy.run();
}
_accumulate_projection.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.run();
}
- if(_has_projection_clipping)
+ if (_has_projection_clipping)
{
_projection_clip.run();
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.run();
}
@@ -1209,9 +1448,9 @@ void NEQLSTMLayer::run()
void NEQLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_convert_input_to_forget_weights_to_qsymm8)
+ if (_convert_input_to_forget_weights_to_qsymm8)
{
_input_to_forget_weights_f32.allocator()->allocate();
_input_to_forget_weights_symm8.allocator()->allocate();
@@ -1234,28 +1473,25 @@ void NEQLSTMLayer::prepare()
_transpose_recurrent_to_output_weights.run();
// Precompute effective biases
- if(_has_cifg)
+ if (_has_cifg)
{
- std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 32767);
}
else
{
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- ITensorPack packII =
- {
- { TensorType::ACL_SRC, _input_to_input_weights },
- { TensorType::ACL_DST, &_input_to_input_eff_bias }
- };
- NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII);
+ ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights},
+ {TensorType::ACL_DST, &_input_to_input_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY,
+ _input_to_input_reduction->window(), packII);
- ITensorPack packRI =
- {
- { TensorType::ACL_SRC, _recurrent_to_input_weights },
- { TensorType::ACL_DST, &_recurrent_to_input_eff_bias }
- };
- NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI);
+ ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights},
+ {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY,
+ _recurrent_to_input_reduction->window(), packRI);
_input_to_input_weights_transposed.allocator()->allocate();
_recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1271,58 +1507,44 @@ void NEQLSTMLayer::prepare()
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- ITensorPack packIF =
- {
- { TensorType::ACL_SRC, _input_to_forget_weights },
- { TensorType::ACL_DST, &_input_to_forget_eff_bias }
- };
- NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF);
-
- ITensorPack packRF =
- {
- { TensorType::ACL_SRC, _recurrent_to_forget_weights },
- { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias }
- };
- NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF);
-
- ITensorPack packIC =
- {
- { TensorType::ACL_SRC, _input_to_cell_weights },
- { TensorType::ACL_DST, &_input_to_cell_eff_bias }
- };
- NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), packIC);
-
- ITensorPack packRC =
- {
- { TensorType::ACL_SRC, _recurrent_to_cell_weights },
- { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias }
- };
- NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC);
-
- ITensorPack packIO =
- {
- { TensorType::ACL_SRC, _input_to_output_weights },
- { TensorType::ACL_DST, &_input_to_output_eff_bias }
- };
- NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO);
-
- ITensorPack packRO =
- {
- { TensorType::ACL_SRC, _recurrent_to_output_weights },
- { TensorType::ACL_DST, &_recurrent_to_output_eff_bias }
- };
- NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO);
-
- if(_has_projection)
+ ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights},
+ {TensorType::ACL_DST, &_input_to_forget_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY,
+ _input_to_forget_reduction->window(), packIF);
+
+ ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights},
+ {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY,
+ _recurrent_to_forget_reduction->window(), packRF);
+
+ ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights},
+ {TensorType::ACL_DST, &_input_to_cell_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(),
+ packIC);
+
+ ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights},
+ {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY,
+ _recurrent_to_cell_reduction->window(), packRC);
+
+ ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights},
+ {TensorType::ACL_DST, &_input_to_output_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY,
+ _input_to_output_reduction->window(), packIO);
+
+ ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights},
+ {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY,
+ _recurrent_to_output_reduction->window(), packRO);
+
+ if (_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, _projection_weights },
- { TensorType::ACL_DST, &_projection_eff_bias }
- };
- NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack);
- if(_projection_bias != nullptr)
+ ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights},
+ {TensorType::ACL_DST, &_projection_eff_bias}};
+ NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(),
+ pack);
+ if (_projection_bias != nullptr)
{
_projection_bias_add.run();
_projection_bias->mark_as_unused();
@@ -1332,7 +1554,7 @@ void NEQLSTMLayer::prepare()
_transpose_projection_weights.run();
_projection_weights->mark_as_unused();
- if(!_projection_tensor_copy_required)
+ if (!_projection_tensor_copy_required)
{
_hidden_gate.mark_as_unused();
_projection_accumulate_res.mark_as_unused();
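
For readers scanning the NEQLSTMLayer hunks above, the changes are purely mechanical: long validation calls are re-wrapped so that arguments align under the opening parenthesis, and if(...) becomes if (...); behaviour is unchanged. The following is a minimal standalone sketch of that wrapping style; Status and validate_stage() are invented stand-ins, not Compute Library declarations.

// Standalone illustration of the argument wrapping applied in the hunks above.
// Status and validate_stage() are stand-ins, not Compute Library declarations.
#include <string>

struct Status
{
    bool        ok{true};
    std::string message{};
};

Status validate_stage(const std::string &name, float scale, int multiplier, int shift)
{
    // A real validator would range-check the quantization parameters here.
    return (scale > 0.f && shift <= 0) ? Status{} : Status{false, name + ": bad parameters"};
}

Status validate_all(float forget_scale, float cell_scale)
{
    // Before the reformat such calls sat on a single long line; afterwards the
    // arguments wrap and align under the opening parenthesis of the call.
    const Status forget_status = validate_stage("forget_gate", forget_scale,
                                                /* multiplier */ 1073741824, /* shift */ -3);
    if (!forget_status.ok)
    {
        return forget_status;
    }
    return validate_stage("cell_gate", cell_scale, /* multiplier */ 1073741824,
                          /* shift */ -3);
}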
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index dad246ac89..9b72783c97 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,19 +26,19 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/cpu/operators/CpuQuantize.h"
namespace arm_compute
{
struct NEQuantizationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuQuantize> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuQuantize> op{nullptr};
};
-NEQuantizationLayer::NEQuantizationLayer()
- : _impl(std::make_unique<Impl>())
+NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>())
{
}
NEQuantizationLayer::~NEQuantizationLayer() = default;
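
The NEQuantizationLayer hunk is representative of two smaller rules applied throughout: braced member initializers lose their inner padding ({ nullptr } becomes {nullptr}) and short constructors collapse onto a single line. A toy pimpl-style class, with names invented for illustration only, shows the same spacing.

// Toy pimpl-style class showing the braced-initializer spacing used above.
// Widget and its members are illustrative only.
#include <memory>

class Widget
{
private:
    struct Impl
    {
        const int *src{nullptr}; // previously written as { nullptr }
        int       *dst{nullptr};
    };

public:
    Widget() : _impl(std::make_unique<Impl>())
    {
    }

private:
    std::unique_ptr<Impl> _impl;
};

int main()
{
    Widget w;
    return 0;
}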
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index a66ef3d27a..2824693800 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -37,13 +38,26 @@ namespace arm_compute
NERNNLayer::~NERNNLayer() = default;
NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(),
+ : _memory_group(std::move(memory_manager)),
+ _gemm_state_f(),
+ _add_f(),
+ _activation(),
+ _fully_connected(memory_manager),
+ _copy_f(),
+ _fully_connected_out(),
+ _gemm_output(),
+ _add_output(),
_is_prepared(false)
{
}
-Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
- const ITensorInfo *output, const ActivationLayerInfo &info)
+Status NERNNLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -60,24 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+ auto shape_info =
+ TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+ input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info));
return Status{};
}
-void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output,
+void NERNNLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *recurrent_weights,
+ const ITensor *bias,
+ ITensor *hidden_state,
+ ITensor *output,
ActivationLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+ bias->info(), hidden_state->info(), output->info(), info));
ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+ TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(),
+ hidden_state->info()->dimension(idx_height));
_is_prepared = false;
@@ -125,7 +149,7 @@ void NERNNLayer::run()
void NERNNLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fully_connected.prepare();
_gemm_state_f.prepare();
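
The NERNNLayer hunks above also show how long declarations are treated: once a signature exceeds the column limit, each parameter moves onto its own line. Below is a standalone sketch under that convention, using a hypothetical TensorDesc type and validate_rnn_like() function rather than library declarations.

// Standalone sketch of the one-parameter-per-line signature style seen above.
// TensorDesc and validate_rnn_like() are illustrative, not library declarations.
struct TensorDesc
{
    int width{0};
    int height{0};
};

bool validate_rnn_like(const TensorDesc *input,
                       const TensorDesc *weights,
                       const TensorDesc *recurrent_weights,
                       const TensorDesc *bias,
                       const TensorDesc *hidden_state,
                       const TensorDesc *output)
{
    // Only the declaration layout changes; the body is untouched by the reformat.
    return input != nullptr && weights != nullptr && recurrent_weights != nullptr && bias != nullptr &&
           hidden_state != nullptr && output != nullptr;
}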
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index a9bdb50d95..68bb5d5ef3 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -29,14 +29,20 @@
namespace arm_compute
{
-Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info));
return Status{};
}
-void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayer::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index a24f2aac50..babec4aa92 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
@@ -31,17 +33,22 @@ namespace arm_compute
{
NEROIPoolingLayer::~NEROIPoolingLayer() = default;
-NEROIPoolingLayer::NEROIPoolingLayer()
- : _roi_kernel()
+NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel()
{
}
-Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info);
}
-void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
@@ -53,4 +60,4 @@ void NEROIPoolingLayer::run()
{
NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index a6f7be8be0..95492df126 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NERangeKernel.h"
@@ -31,8 +32,7 @@ namespace arm_compute
{
NERange::~NERange() = default;
-NERange::NERange()
- : _kernel()
+NERange::NERange() : _kernel()
{
}
@@ -52,4 +52,4 @@ void NERange::run()
{
NEScheduler::get().schedule(_kernel.get(), Window::DimX);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 9f96479295..d37cf4a8d0 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -25,21 +25,24 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
namespace
{
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
@@ -47,29 +50,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
//axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
// Only validate if not using auto_init for the output tensor
TensorShape out_shape = input->tensor_shape();
// Validate output_shape only if not using auto_init
convert_negative_axis(axis_local, input_dims);
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ for (unsigned int i = 0; i < reduction_ops; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ if (output->total_size() > 0 && keep_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
}
- if(keep_dims)
+ if (keep_dims)
{
out_shape.set(axis_local[i], 1);
}
@@ -91,11 +94,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
NEReduceMean::~NEReduceMean() = default;
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
+ : _memory_group(std::move(memory_manager)),
+ _reduction_kernels(),
+ _reduced_outs(),
+ _reshape(),
+ _reduction_ops(),
+ _keep_dims()
{
}
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status NEReduceMean::validate(const ITensorInfo *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ const ITensorInfo *output)
{
return validate_config(input, reduction_axis, keep_dims, output);
}
@@ -107,7 +118,8 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
// Output auto inizialitation if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
_reduction_ops = reduction_axis.num_dimensions();
@@ -124,37 +136,40 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ TensorShape out_shape =
+ i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
- if(i == _reduction_ops - 1 && keep_dims)
+ if (i == _reduction_ops - 1 && keep_dims)
{
_reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(),
+ tmp_output->info()->data_type(),
+ tmp_output->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
_reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
// Allocate intermediate tensors
- for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
// Configure reshape layer if we want to drop the dimensions
- if(!keep_dims)
+ if (!keep_dims)
{
TensorShape out_shape = tmp_input->info()->tensor_shape();
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i, false);
}
@@ -166,11 +181,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- for(auto &kernel : _reduction_kernels)
+ for (auto &kernel : _reduction_kernels)
{
kernel.run();
}
- if(!_keep_dims)
+ if (!_keep_dims)
{
_reshape.run();
}
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 9660347a16..8540d750fc 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
@@ -42,7 +43,7 @@ namespace
*/
size_t reduction_window_split_dimension(unsigned int axis)
{
- switch(axis)
+ switch (axis)
{
case 0:
return Window::DimY;
@@ -59,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis)
NEReductionOperation::~NEReductionOperation() = default;
NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+ : _memory_group(memory_manager),
+ _reduction_kernel(),
+ _reshape(),
+ _output_internal(),
+ _window_split(0),
+ _reduction_axis(),
+ _is_reshape_required(false)
{
}
-Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status NEReductionOperation::validate(
+ const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const auto is_reshape_required = !keep_dims;
@@ -74,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
TensorInfo info_before_reshape;
- if(is_reshape_required)
+ if (is_reshape_required)
{
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
auto shape_before_reshape = input->tensor_shape();
@@ -84,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
const auto input_num_channles = input->num_channels();
const auto input_qinfo = input->quantization_info();
- const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
- const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
+ const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
- info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo);
+ info_before_reshape.set_data_type(output_data_type)
+ .set_tensor_shape(shape_before_reshape)
+ .set_num_channels(input_num_channles)
+ .set_quantization_info(input_qinfo);
output_internal = &info_before_reshape;
}
ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op));
- if(is_reshape_required)
+ if (is_reshape_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
}
@@ -102,7 +115,8 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void NEReductionOperation::configure(
+ ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
@@ -112,19 +126,32 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
auto *output_internal = output;
const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
- const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
- const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- const auto num_channels = input->info()->num_channels();
- const auto qinfo = input->info()->quantization_info();
-
- _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels(
- num_channels).set_quantization_info(qinfo));
+ const auto output_internal_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const auto output_external_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+ const auto num_channels = input->info()->num_channels();
+ const auto qinfo = input->info()->quantization_info();
+
+ _output_internal.allocator()->init(input->info()
+ ->clone()
+ ->set_data_type(output_data_type)
+ .set_tensor_shape(output_internal_shape)
+ .reset_padding()
+ .set_is_resizable(true)
+ .set_num_channels(num_channels)
+ .set_quantization_info(qinfo));
_memory_group.manage(&_output_internal);
output_internal = &_output_internal;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_data_type(output_data_type)
+ .set_tensor_shape(output_external_shape)
+ .reset_padding()
+ .set_is_resizable(true));
}
ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
@@ -135,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
_window_split = reduction_window_split_dimension(axis);
_reduction_axis = axis;
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.configure(output_internal, output);
_output_internal.allocator()->allocate();
@@ -146,7 +173,7 @@ void NEReductionOperation::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(_reduction_kernel.get(), _window_split);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.run();
}
diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp
index 427bf8c501..89cf575f38 100644
--- a/src/runtime/NEON/functions/NEReorderLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorderLayer.cpp
@@ -23,20 +23,24 @@
*/
#if defined(__aarch64__)
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/NEON/kernels/NEReorderKernel.h"
namespace arm_compute
{
NEReorderLayer::~NEReorderLayer() = default;
-NEReorderLayer::NEReorderLayer()
- : _reorder_kernel(std::make_unique<NEReorderKernel>())
+NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>())
{
}
-void NEReorderLayer::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderLayer::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
auto k = std::make_unique<NEReorderKernel>();
k->configure(input, output, input_wf, output_wf);
@@ -49,11 +53,14 @@ void NEReorderLayer::run()
NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX);
}
-Status NEReorderLayer::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
return NEReorderKernel::validate(input, output, input_wf, output_wf);
}
} // namespace arm_compute
-#endif // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index 8ee73d7390..14e41d6df4 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
-#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index 3ccb42361e..bed70ff66c 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/core/Validate.h"
+
#include "src/cpu/operators/CpuReshape.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NEReshapeLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuReshape> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuReshape> op{nullptr};
};
-NEReshapeLayer::NEReshapeLayer()
- : _impl(std::make_unique<Impl>())
+NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>())
{
}
-NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
+NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default;
NEReshapeLayer::~NEReshapeLayer() = default;
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index e1988f2ab3..a90f8d2e76 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-#include "src/core/NEON/kernels/NEReverseKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEReverseKernel.h"
namespace arm_compute
{
@@ -38,7 +37,10 @@ void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *
_kernel = std::move(k);
}
-Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverse::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
return NEReverseKernel::validate(input, output, axis, use_inverted_axis);
}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 09f037334e..0d011064f6 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/common/utils/Log.h"
#include "src/core/utils/ScaleUtils.h"
#include "src/cpu/operators/CpuScale.h"
@@ -32,16 +33,16 @@ namespace arm_compute
{
struct NEScale::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- Tensor dx{ nullptr }; /**< Element's distance between the X real coordinate and the smallest X following integer */
- Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
- Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
- std::unique_ptr<cpu::CpuScale> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */
+ Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
+ Tensor offsets{
+ nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+ std::unique_ptr<cpu::CpuScale> op{nullptr};
};
-NEScale::NEScale()
- : _impl(std::make_unique<Impl>())
+NEScale::NEScale() : _impl(std::make_unique<Impl>())
{
}
NEScale::~NEScale() = default;
@@ -57,25 +58,33 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
// Configure for size of allocation of internal tensors
// Get data layout and width/height indices
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout =
+ info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
// Get the tensor shape
TensorShape shape(output->info()->dimension(idx_width));
shape.set(1, output->info()->dimension(idx_height), false);
- bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(data_layout, input->info()->data_type(), policy_to_use, info.border_mode);
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ data_layout, input->info()->data_type(), policy_to_use, info.border_mode);
- if(precompute_indices_weights)
+ if (precompute_indices_weights)
{
const TensorInfo tensor_info_dxdy(shape, Format::F32);
const TensorInfo tensor_info_offsets(shape, Format::S32);
@@ -83,7 +92,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
_impl->dx.allocator()->init(tensor_info_dxdy);
_impl->dy.allocator()->init(tensor_info_dxdy);
_impl->offsets.allocator()->init(tensor_info_offsets);
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -109,7 +118,8 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
}
else
{
- if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR &&
+ policy_to_use != InterpolationPolicy::AREA)
{
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index 26c2eb8fe9..55cad2202b 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NESelect.h"
#include "arm_compute/core/Types.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESelectKernel.h"
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 4a8912bfe9..12d43adc84 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -25,8 +25,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
@@ -34,7 +35,10 @@ namespace arm_compute
{
namespace experimental
{
-void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void NESlice::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
@@ -47,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo
_kernel = std::move(k);
}
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
// Check start dimensions for being non-negative
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
- {
- return i < 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -66,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
struct NESlice::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NESlice> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<experimental::NESlice> op{nullptr};
};
-NESlice::NESlice()
- : _impl(std::make_unique<Impl>())
+NESlice::NESlice() : _impl(std::make_unique<Impl>())
{
}
-NESlice::NESlice(NESlice &&) = default;
+NESlice::NESlice(NESlice &&) = default;
NESlice &NESlice::operator=(NESlice &&) = default;
NESlice::~NESlice() = default;
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
return experimental::NESlice::validate(input, output, starts, ends);
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 0947ff94a6..e3c2012d05 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
#include "src/cpu/kernels/CpuSoftmaxKernel.h"
@@ -35,10 +37,10 @@ namespace arm_compute
template <bool IS_LOG>
struct NESoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- Tensor max{ nullptr };
- std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ Tensor max{nullptr};
+ std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<Tensor> workspace_tensors{};
@@ -53,9 +55,9 @@ NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryMana
template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
-template <bool IS_LOG>
+template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default;
-template <bool IS_LOG>
+template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
template <bool IS_LOG>
@@ -68,12 +70,13 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
_impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
_impl->op->configure(input->info(), output->info(), beta, axis);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
template <bool IS_LOG>
-Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
@@ -81,7 +84,7 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
}
template <bool IS_LOG>
-void NESoftmaxLayerGeneric<IS_LOG>::run()
+void NESoftmaxLayerGeneric<IS_LOG>::run()
{
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index c4509510dc..556ebdd800 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
@@ -37,17 +38,19 @@ namespace arm_compute
{
NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
-NESpaceToBatchLayer::NESpaceToBatchLayer()
- : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
+NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
{
}
-void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
_fill_f = std::make_unique<NEFill>();
@@ -57,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s
_space_to_batch_kernel->configure(input, block_shape, paddings, output);
}
-void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
_fill_f = std::make_unique<NEFill>();
@@ -71,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_
_space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -89,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
void NESpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- if(_has_padding)
+ if (_has_padding)
{
_fill_f->run();
}
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index b37bf0d20f..846b619429 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
@@ -36,8 +37,7 @@ namespace arm_compute
{
NESpaceToDepthLayer::~NESpaceToDepthLayer() = default;
-NESpaceToDepthLayer::NESpaceToDepthLayer()
- : _space_to_depth_kernel()
+NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel()
{
}
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index db19bbb824..53b09e9ae5 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -34,7 +34,7 @@ namespace arm_compute
{
void NESplit::run()
{
- for(unsigned i = 0; i < _num_outputs; ++i)
+ for (unsigned i = 0; i < _num_outputs; ++i)
{
_slice_functions[i].run();
}
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 68554e0931..03e7026691 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStackLayerKernel.h"
@@ -38,9 +39,7 @@ namespace arm_compute
NEStackLayer::~NEStackLayer() = default;
NEStackLayer::NEStackLayer() // NOLINT
- : _input(),
- _stack_kernels(),
- _num_inputs(0)
+ : _input(), _stack_kernels(), _num_inputs(0)
{
}
@@ -54,7 +53,7 @@ void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITen
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
- for(unsigned int i = 0; i < _num_inputs; i++)
+ for (unsigned int i = 0; i < _num_inputs; i++)
{
_stack_kernels[i] = std::make_unique<NEStackLayerKernel>();
_stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
@@ -72,7 +71,7 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
const unsigned int num_inputs = input.size();
- for(unsigned int i = 0; i < num_inputs; i++)
+ for (unsigned int i = 0; i < num_inputs; i++)
{
// All the tensors must have the same rank
ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -85,7 +84,7 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
void NEStackLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ for (unsigned i = 0; i < _num_inputs; i++)
{
NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
}
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index 4f50749a4f..6a3ac8be05 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
@@ -32,9 +33,14 @@ namespace arm_compute
{
namespace experimental
{
-void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
@@ -43,9 +49,14 @@ void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output,
_kernel = std::move(k);
}
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
}
@@ -53,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out
struct NEStridedSlice::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NEStridedSlice> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<experimental::NEStridedSlice> op{nullptr};
};
-NEStridedSlice::NEStridedSlice()
- : _impl(std::make_unique<Impl>())
+NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>())
{
}
-NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
+NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default;
NEStridedSlice::~NEStridedSlice() = default;
-void NEStridedSlice::configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensor *input,
+ ITensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
_impl->src = input;
_impl->dst = output;
@@ -84,10 +99,16 @@ void NEStridedSlice::run()
_impl->op->run(pack);
}
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 526603f1a3..d10b1c8e95 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -23,9 +23,8 @@
*/
#include "arm_compute/runtime/NEON/functions/NETile.h"
-#include "src/core/NEON/kernels/NETileKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NETileKernel.h"
namespace arm_compute
{
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 78c7ea202a..0144a85e8c 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuTranspose.h"
@@ -31,13 +32,12 @@ namespace arm_compute
{
struct NETranspose::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuTranspose> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuTranspose> op{nullptr};
};
-NETranspose::NETranspose()
- : _impl(std::make_unique<Impl>())
+NETranspose::NETranspose() : _impl(std::make_unique<Impl>())
{
}
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 0ffab5e92a..2f7ed2bb1f 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -39,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
}
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start,
+ int32_t &slice_end_mask,
+ const unsigned int input_num_dimensions)
{
// Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
Coordinates slice_end;
slice_start.set_num_dimensions(input_num_dimensions);
slice_end.set_num_dimensions(input_num_dimensions);
- for(size_t k = 0; k < input_num_dimensions; ++k)
+ for (size_t k = 0; k < input_num_dimensions; ++k)
{
slice_start.set(k, 0);
slice_end.set(k, -1);
@@ -55,19 +58,19 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &
} // namespace
NEUnstack::NEUnstack() // NOLINT
- : _num_slices(0),
- _strided_slice_vector()
+ : _num_slices(0), _strided_slice_vector()
{
}
void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
{
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
- std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t->info();
- });
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+ [](ITensor *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -81,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &ou
Coordinates slice_start;
int32_t slice_end_mask;
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
- for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ for (unsigned int slice = 0; slice < _num_slices; ++slice)
{
// Adjusts start and end coordinates to take a 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0,
+ slice_end_mask, (1 << axis_u));
}
}
@@ -102,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn
Coordinates slice_start;
int32_t slice_end_mask;
- for(size_t k = 0; k < num_slices; ++k)
+ for (size_t k = 0; k < num_slices; ++k)
{
slice_start.set(wrap_axis(axis, input), k);
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
- ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask,
+ (1 << wrap_axis(axis, input))));
}
return Status{};
}
void NEUnstack::run()
{
- for(unsigned i = 0; i < _num_slices; ++i)
+ for (unsigned i = 0; i < _num_slices; ++i)
{
_strided_slice_vector[i].run();
}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index a8eded29ff..8d77abcfc7 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -26,15 +26,15 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuWinogradConv2d.h"
-#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-
namespace arm_compute
{
using namespace arm_compute::experimental;
@@ -42,14 +42,14 @@ using namespace arm_compute::experimental;
struct NEWinogradConvolutionLayer::Impl
{
MemoryGroup memory_group{};
- std::unique_ptr<cpu::CpuWinogradConv2d> op{ nullptr };
+ std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr};
ITensorPack run_pack{};
ITensorPack prep_pack{};
WorkspaceData<Tensor> workspace{};
experimental::MemoryRequirements aux_mem_req{};
- const ITensor *original_weights{ nullptr };
- bool is_prepared{ false };
- bool is_activationlayer_enabled{ false };
+ const ITensor *original_weights{nullptr};
+ bool is_prepared{false};
+ bool is_activationlayer_enabled{false};
DataLayout data_layout{};
};
@@ -61,17 +61,24 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMe
NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default;
-void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
- bool enable_fast_math)
+void NEWinogradConvolutionLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
_impl->original_weights = weights;
_impl->op = std::make_unique<cpu::CpuWinogradConv2d>();
- _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math);
+ _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ conv_info, act_info, enable_fast_math);
_impl->aux_mem_req = _impl->op->workspace();
- _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
- _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
- _impl->workspace = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
void NEWinogradConvolutionLayer::run()
@@ -82,15 +89,20 @@ void NEWinogradConvolutionLayer::run()
_impl->op->run(_impl->run_pack);
}
-Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
}
void NEWinogradConvolutionLayer::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
_impl->op->prepare(_impl->prep_pack);
_impl->original_weights->mark_as_unused();
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index b0a553212a..d4d6193fce 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
+
#include <omp.h>
namespace arm_compute
@@ -63,7 +64,7 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
const unsigned int num_threads = std::min(num_iterations, _num_threads);
- if(!kernel->is_parallelisable() || num_threads == 1)
+ if (!kernel->is_parallelisable() || num_threads == 1)
{
ThreadInfo info;
info.cpu_info = &cpu_info();
@@ -73,10 +74,10 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win
{
const unsigned int num_windows = num_threads;
std::vector<IScheduler::Workload> workloads(num_windows);
- for(unsigned int t = 0; t < num_windows; t++)
+ for (unsigned int t = 0; t < num_windows; t++)
{
//Capture 't' by copy, all the other variables by reference:
- workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+ workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
{
Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
win.validate();
@@ -92,7 +93,7 @@ void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload>
const unsigned int amount_of_work = static_cast<unsigned int>(workloads.size());
const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work);
- if(num_threads_to_use < 1)
+ if (num_threads_to_use < 1)
{
return;
}
@@ -100,8 +101,9 @@ void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload>
ThreadInfo info;
info.cpu_info = &cpu_info();
info.num_threads = num_threads_to_use;
- #pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1)
- for(unsigned int wid = 0; wid < amount_of_work; ++wid)
+#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \
+ schedule(static, 1)
+ for (unsigned int wid = 0; wid < amount_of_work; ++wid)
{
const int tid = omp_get_thread_num();
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index a47fa184fa..d746f618b5 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -43,8 +43,7 @@ size_t align_offset(size_t offset, size_t alignment)
return (remainder != 0U) ? offset + (alignment - remainder) : offset;
}
} // namespace
-OffsetLifetimeManager::OffsetLifetimeManager()
- : _blob(0)
+OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0)
{
}
@@ -71,21 +70,22 @@ void OffsetLifetimeManager::update_blobs_and_mappings()
// Update blob size
size_t max_aggregated_size = 0;
- std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
- {
- max_aggregated_size += b.max_size;
- _blob.alignment = std::max(_blob.alignment, b.max_alignment);
- });
+ std::for_each(std::begin(_free_blobs), std::end(_free_blobs),
+ [&](const Blob &b)
+ {
+ max_aggregated_size += b.max_size;
+ _blob.alignment = std::max(_blob.alignment, b.max_alignment);
+ });
max_aggregated_size += _free_blobs.size() * _blob.alignment;
_blob.owners = std::max(_blob.owners, _free_blobs.size());
_blob.size = std::max(_blob.size, max_aggregated_size);
// Calculate group mappings
- auto &group_mappings = _active_group->mappings();
+ auto &group_mappings = _active_group->mappings();
size_t offset = 0;
- for(auto &free_blob : _free_blobs)
+ for (auto &free_blob : _free_blobs)
{
- for(auto &bound_element_id : free_blob.bound_elements)
+ for (auto &bound_element_id : free_blob.bound_elements)
{
ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
Element &bound_element = _active_elements[bound_element_id];
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index ffedf5586c..8f3c1a84ba 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -21,8 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <algorithm>
-
#include "arm_compute/runtime/OffsetMemoryPool.h"
#include "arm_compute/core/Error.h"
@@ -31,6 +29,8 @@
#include "arm_compute/runtime/MemoryRegion.h"
#include "arm_compute/runtime/Types.h"
+#include <algorithm>
+
namespace arm_compute
{
OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
@@ -50,7 +50,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles)
ARM_COMPUTE_ERROR_ON(_blob == nullptr);
// Set memory to handlers
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
@@ -59,7 +59,7 @@ void OffsetMemoryPool::acquire(MemoryMappings &handles)
void OffsetMemoryPool::release(MemoryMappings &handles)
{
- for(auto &handle : handles)
+ for (auto &handle : handles)
{
ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
handle.first->set_region(nullptr);
diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp
index a8ad53da90..19415b35cf 100644
--- a/src/runtime/OperatorTensor.cpp
+++ b/src/runtime/OperatorTensor.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/OperatorTensor.h"
+
#include "arm_compute/runtime/MemoryRegion.h"
#include "support/Cast.h"
@@ -47,7 +48,7 @@ ITensorInfo *OperatorTensor::info()
uint8_t *OperatorTensor::buffer() const
{
- switch(_mem_type)
+ switch (_mem_type)
{
case MemoryType::CPU:
return (uint8_t *)utils::cast::polymorphic_downcast<MemoryRegion *>(_memory->region())->buffer();
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 87376a71a4..7fb9bd8000 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -31,8 +31,7 @@
using namespace arm_compute;
-PoolManager::PoolManager()
- : _free_pools(), _occupied_pools(), _sem(), _mtx()
+PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx()
{
}
@@ -52,10 +51,8 @@ void PoolManager::unlock_pool(IMemoryPool *pool)
ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
- auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it)
- {
- return pool_it.get() == pool;
- });
+ auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools),
+ [pool](const std::unique_ptr<IMemoryPool> &pool_it) { return pool_it.get() == pool; });
ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!");
_free_pools.splice(std::begin(_free_pools), _occupied_pools, it);
_sem->signal();
@@ -78,7 +75,7 @@ std::unique_ptr<IMemoryPool> PoolManager::release_pool()
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!");
- if(!_free_pools.empty())
+ if (!_free_pools.empty())
{
std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front());
ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr);
diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp
index d1dea066e7..1de8d2abdb 100644
--- a/src/runtime/RuntimeContext.cpp
+++ b/src/runtime/RuntimeContext.cpp
@@ -28,8 +28,7 @@
namespace arm_compute
{
-RuntimeContext::RuntimeContext()
- : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get())
+RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get())
{
}
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a2ad..e52fb59940 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -76,7 +76,7 @@ void Scheduler::set(Type t)
bool Scheduler::is_available(Type t)
{
- if(t == Type::CUSTOM)
+ if (t == Type::CUSTOM)
{
return _custom_scheduler != nullptr;
}
@@ -93,11 +93,12 @@ Scheduler::Type Scheduler::get_type()
IScheduler &Scheduler::get()
{
- if(_scheduler_type == Type::CUSTOM)
+ if (_scheduler_type == Type::CUSTOM)
{
- if(_custom_scheduler == nullptr)
+ if (_custom_scheduler == nullptr)
{
- ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+ ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) "
+ "before Scheduler::get()");
}
else
{
@@ -106,13 +107,13 @@ IScheduler &Scheduler::get()
}
else
{
- if(_schedulers.empty())
+ if (_schedulers.empty())
{
_schedulers = init();
}
auto it = _schedulers.find(_scheduler_type);
- if(it != _schedulers.end())
+ if (it != _schedulers.end())
{
return *it->second;
}
diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp
index cc21d62630..4fb08d79f5 100644
--- a/src/runtime/SchedulerFactory.cpp
+++ b/src/runtime/SchedulerFactory.cpp
@@ -48,7 +48,7 @@ const SchedulerFactory::Type SchedulerFactory::_default_type = SchedulerFactory:
std::unique_ptr<IScheduler> SchedulerFactory::create(Type type)
{
- switch(type)
+ switch (type)
{
case Type::ST:
{
diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp
index 6f9a32c879..74ee539fec 100644
--- a/src/runtime/SchedulerUtils.cpp
+++ b/src/runtime/SchedulerUtils.cpp
@@ -47,35 +47,34 @@ std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std:
double ratio = m / static_cast<double>(n);
// nt = sqrt(max_threads * (m / n) )
- const unsigned adjusted = std::round(
- std::sqrt(max_threads * ratio));
+ const unsigned adjusted = std::round(std::sqrt(max_threads * ratio));
//find the nearest factor of max_threads
- for(unsigned i = 0; i != adjusted; ++i)
+ for (unsigned i = 0; i != adjusted; ++i)
{
//try down
const unsigned adj_down = adjusted - i;
- if(max_threads % adj_down == 0)
+ if (max_threads % adj_down == 0)
{
- return { adj_down, max_threads / adj_down };
+ return {adj_down, max_threads / adj_down};
}
//try up
const unsigned adj_up = adjusted + i;
- if(max_threads % adj_up == 0)
+ if (max_threads % adj_up == 0)
{
- return { adj_up, max_threads / adj_up };
+ return {adj_up, max_threads / adj_up};
}
}
//we didn't find anything so lets bail out with maxes biased to the largest dimension
- if(m > n)
+ if (m > n)
{
- return { std::min<unsigned>(m, max_threads), 1 };
+ return {std::min<unsigned>(m, max_threads), 1};
}
else
{
- return { 1, std::min<unsigned>(n, max_threads) };
+ return {1, std::min<unsigned>(n, max_threads)};
}
}
#endif /* #ifndef BARE_METAL */
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index ae16c8be0a..f87256abb1 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -27,8 +27,7 @@
using namespace arm_compute;
-SubTensor::SubTensor()
- : _parent(nullptr), _info()
+SubTensor::SubTensor() : _parent(nullptr), _info()
{
}
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 6dcef9f0b5..f17e323694 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -25,8 +25,7 @@
namespace arm_compute
{
-Tensor::Tensor(IRuntimeContext *)
- : _allocator(this)
+Tensor::Tensor(IRuntimeContext *) : _allocator(this)
{
}
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 4ae27c59fc..372852bfea 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -43,13 +43,13 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c
const size_t parent_dims = parent_info.num_dimensions();
const size_t child_dims = child_info.num_dimensions();
- if(child_dims <= parent_dims)
+ if (child_dims <= parent_dims)
{
- for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
+ for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
{
const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1];
- if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
+ if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
{
is_valid = false;
break;
@@ -65,8 +65,7 @@ bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &c
}
} // namespace
-TensorAllocator::TensorAllocator(IMemoryManageable *owner)
- : _owner(owner), _associated_memory_group(nullptr), _memory()
+TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory()
{
}
@@ -88,7 +87,7 @@ TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept
TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept
{
- if(&o != this)
+ if (&o != this)
{
_owner = o._owner;
o._owner = nullptr;
@@ -117,8 +116,10 @@ void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &
_memory = Memory(allocator._memory.region());
// Init tensor info with new dimensions
- size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
- sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size);
+ size_t total_size =
+ parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
+ sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(),
+ parent_info.offset_element_in_bytes(coords), total_size);
// Set TensorInfo
init(sub_info);
@@ -133,7 +134,7 @@ void TensorAllocator::allocate()
{
// Align to 64-byte boundaries by default if alignment is not specified
const size_t alignment_to_use = (alignment() != 0) ? alignment() : 64;
- if(_associated_memory_group == nullptr)
+ if (_associated_memory_group == nullptr)
{
_memory.set_owned_region(std::make_unique<MemoryRegion>(info().total_size(), alignment_to_use));
}
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 15e9d43a49..a7f7b5f3cb 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -41,20 +41,17 @@ static const std::string information =
const std::string &string_from_scheduler_type(Scheduler::Type t)
{
- static std::map<Scheduler::Type, const std::string> scheduler_type_map =
- {
- { Scheduler::Type::ST, "Single Thread" },
- { Scheduler::Type::CPP, "C++11 Threads" },
- { Scheduler::Type::OMP, "OpenMP Threads" },
- { Scheduler::Type::CUSTOM, "Custom" }
- };
+ static std::map<Scheduler::Type, const std::string> scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"},
+ {Scheduler::Type::CPP, "C++11 Threads"},
+ {Scheduler::Type::OMP, "OpenMP Threads"},
+ {Scheduler::Type::CUSTOM, "Custom"}};
return scheduler_type_map[t];
}
void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints)
{
- if(ctx)
+ if (ctx)
{
ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr);
ctx->scheduler()->schedule(kernel, hints);
@@ -68,7 +65,7 @@ void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const ISch
unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis)
{
// We need only 1 stage for all axis except x-axis
- if(axis != 0)
+ if (axis != 0)
{
return 1;
}
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
index 1bfb8124e9..aba32871d0 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include <utility>
namespace arm_compute
@@ -37,25 +38,27 @@ namespace cl_direct_conv
{
using namespace arm_compute::misc::shape_calculator;
-ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu)
- : IClDirectConvKernelConfig(gpu)
+ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
{
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
- using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info);
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDirectConvDefaultConfigBifrost::configure_G71_f32,
- &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
- &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
+ &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClDirectConvDefaultConfigBifrost::configure_default_f32,
- &ClDirectConvDefaultConfigBifrost::configure_default_f16,
- &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+ &ClDirectConvDefaultConfigBifrost::configure_default_f32,
+ &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
func = configs_G71.get_function(src->data_type());
@@ -69,18 +72,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const IT
return (this->*func)(src, wei, conv_info);
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 2;
}
@@ -93,18 +98,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -117,18 +124,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -141,18 +150,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(c
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 2;
}
@@ -165,18 +176,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -188,5 +201,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_
return desc;
}
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
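The configure() methods in this file select a per-data-type handler at runtime and invoke it through a pointer to member function, (this->*func)(...). A compact sketch of that dispatch pattern with placeholder types in place of the library's ITensorInfo and DirectConvComputeKernelInfo:

#include <iostream>

enum class DataType { F32, F16, U8 };

struct KernelInfo
{
    int m0;
    int n0;
};

class ConfigSelector
{
public:
    KernelInfo configure(DataType dt)
    {
        using Fn = KernelInfo (ConfigSelector::*)();
        Fn func = nullptr;
        switch (dt)
        {
            case DataType::F32: func = &ConfigSelector::configure_f32; break;
            case DataType::F16: func = &ConfigSelector::configure_f16; break;
            default:            func = &ConfigSelector::configure_u8;  break;
        }
        // Call the selected member function on this object
        return (this->*func)();
    }

private:
    KernelInfo configure_f32() { return KernelInfo{2, 4}; }
    KernelInfo configure_f16() { return KernelInfo{4, 4}; }
    KernelInfo configure_u8()  { return KernelInfo{4, 4}; }
};

int main()
{
    ConfigSelector sel;
    const KernelInfo desc = sel.configure(DataType::F16);
    std::cout << "m0=" << desc.m0 << " n0=" << desc.n0 << '\n'; // m0=4 n0=4
    return 0;
}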
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
index 6b60b2c007..ed6a4c3c68 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
@@ -41,15 +41,21 @@ public:
ClDirectConvDefaultConfigBifrost(GPUTarget gpu);
// Inherited overridden method
- DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
private:
- DirectConvComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
index 8f2fd82412..4b7666d5aa 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include <utility>
namespace arm_compute
@@ -37,25 +38,27 @@ namespace cl_direct_conv
{
using namespace arm_compute::misc::shape_calculator;
-ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu)
- : IClDirectConvKernelConfig(gpu)
+ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
{
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
- using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info);
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDirectConvDefaultConfigValhall::configure_G78_f32,
- &ClDirectConvDefaultConfigValhall::configure_G78_f16,
- &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16,
+ &ClDirectConvDefaultConfigValhall::configure_G78_u8);
- ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(&ClDirectConvDefaultConfigValhall::configure_G57_f32,
- &ClDirectConvDefaultConfigValhall::configure_G57_f16,
- &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+ ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(
+ &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16,
+ &ClDirectConvDefaultConfigValhall::configure_G78_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G57:
func = configs_G57.get_function(src->data_type());
@@ -70,15 +73,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const IT
return (this->*func)(src, wei, conv_info);
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t ofm = dst_shape[0];
@@ -87,11 +92,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
- if(is_pointwise)
+ if (is_pointwise)
{
- if(ofm == 4)
+ if (ofm == 4)
{
desc.m0 = 1;
desc.n0 = 4;
@@ -113,7 +118,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
}
else
{
- if(m < 64)
+ if (m < 64)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -131,15 +136,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t ofm = dst_shape[0];
@@ -149,15 +156,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
// k0 should be as larger as possible. However, we should avoid
// having left-over for loops that make the implementation slower.
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -166,9 +173,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
desc.k0 = 4;
}
- if(is_pointwise)
+ if (is_pointwise)
{
- if(ofm == 4)
+ if (ofm == 4)
{
desc.m0 = 1;
desc.n0 = 4;
@@ -187,15 +194,15 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
}
else
{
- if(m < 64)
+ if (m < 64)
{
desc.m0 = 1;
desc.n0 = 1;
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -206,9 +213,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
}
else
{
- if(ofm >= 16)
+ if (ofm >= 16)
{
- if(m / 6 > 24000)
+ if (m / 6 > 24000)
{
desc.m0 = 6;
}
@@ -223,11 +230,11 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
{
desc.m0 = 2;
desc.n0 = 8;
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -243,18 +250,20 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
desc.n0 = 4;
- if(output_shape[0] > 16)
+ if (output_shape[0] > 16)
{
desc.m0 = 4;
}
@@ -267,15 +276,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(c
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t m = dst_shape[1] * dst_shape[2];
@@ -283,9 +294,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
- if(is_pointwise)
+ if (is_pointwise)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -300,9 +311,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
}
else
{
- if(m < 64)
+ if (m < 64)
{
- if(m == 1)
+ if (m == 1)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -327,15 +338,17 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(
return desc;
}
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Get the output shape
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
const int32_t ofm = dst_shape[0];
@@ -344,9 +357,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(dst_shape[0] <= 4)
+ if (dst_shape[0] <= 4)
{
- if(is_pointwise)
+ if (is_pointwise)
{
desc.m0 = 2;
desc.n0 = 1;
@@ -361,9 +374,9 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
}
else
{
- if(m < 64)
+ if (m < 64)
{
- if(m == 1)
+ if (m == 1)
{
desc.m0 = 1;
desc.n0 = 1;
@@ -378,7 +391,7 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
}
else
{
- if(ofm > 16)
+ if (ofm > 16)
{
desc.m0 = 4;
desc.n0 = 8;
@@ -396,5 +409,5 @@ DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(
return desc;
}
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
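The G78 F16 heuristic above picks k0 as the largest vector length that divides the accumulation depth, so the kernel has no left-over loop iterations. A minimal sketch of that rule; the fallback value of 4 mirrors the hunk shown above:

#include <cstdint>
#include <iostream>

std::int32_t choose_k0(std::int32_t k)
{
    if ((k % 16) == 0)
    {
        return 16;
    }
    else if ((k % 8) == 0)
    {
        return 8;
    }
    return 4; // used when neither 16 nor 8 divides k
}

int main()
{
    std::cout << choose_k0(48) << ' ' << choose_k0(24) << ' ' << choose_k0(10) << '\n'; // 16 8 4
    return 0;
}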
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
index f9d5c5299e..efd879a567 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
@@ -41,15 +41,21 @@ public:
ClDirectConvDefaultConfigValhall(GPUTarget gpu);
// Inherited overridden method
- DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
private:
- DirectConvComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
index 232167fc59..2c2509f70b 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
@@ -46,7 +46,7 @@ public:
*/
static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
@@ -59,6 +59,6 @@ public:
}
}
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */
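ClDirectConvKernelConfig::create above is a small factory that maps the GPU architecture to a concrete heuristic object behind a common interface, with Midgard reusing the Bifrost path. A simplified sketch under stand-in type names, not the library's classes:

#include <iostream>
#include <memory>

enum class GpuArch { MIDGARD, BIFROST, VALHALL };

struct IKernelConfig
{
    virtual ~IKernelConfig() = default;
    virtual const char *name() const = 0;
};

struct BifrostConfig : IKernelConfig { const char *name() const override { return "Bifrost heuristic"; } };
struct ValhallConfig : IKernelConfig { const char *name() const override { return "Valhall heuristic"; } };

std::unique_ptr<IKernelConfig> create(GpuArch arch)
{
    switch (arch)
    {
        case GpuArch::MIDGARD: // Midgard falls back to the Bifrost heuristic
        case GpuArch::BIFROST:
            return std::make_unique<BifrostConfig>();
        case GpuArch::VALHALL:
        default:
            return std::make_unique<ValhallConfig>();
    }
}

int main()
{
    std::cout << create(GpuArch::MIDGARD)->name() << '\n'; // "Bifrost heuristic"
    return 0;
}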
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
index 6104d73594..e5b270c720 100644
--- a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -52,8 +53,7 @@ public:
* @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -65,7 +65,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -92,8 +92,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClDirectConvKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClDirectConvKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig);
@@ -105,11 +104,12 @@ public:
* @param[in] wei Weights tensor
* @param[in] conv_info Convolution info
*/
- virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+ virtual DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
protected:
GPUTarget _target;
};
-} // namespace opencl
+} // namespace cl_direct_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */
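ClDirectConvConfigArray above stores one configuration handler per data-type family and resolves it with a switch in get_function. A reduced sketch with a templated array and illustrative DT indices, not the library's exact definitions:

#include <array>
#include <iostream>
#include <string>

enum class DataType { F32, F16, QASYMM8 };

template <typename T>
class ConfigArray
{
public:
    ConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
    {
    }

    T get_function(DataType data_type)
    {
        switch (data_type)
        {
            case DataType::F32:
                return _configs.at(0); // DT_F32 slot
            case DataType::F16:
                return _configs.at(1); // DT_F16 slot
            default:
                return _configs.at(2); // DT_INT8 slot for the quantized types
        }
    }

private:
    std::array<T, 3> _configs;
};

int main()
{
    ConfigArray<std::string> names("f32 config", "f16 config", "int8 config");
    std::cout << names.get_function(DataType::F16) << '\n'; // "f16 config"
    return 0;
}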
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
index 5311fdcec3..98ebf3ebbe 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
-#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/GPUTarget.h"
@@ -30,28 +29,34 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
namespace arm_compute
{
namespace cl_dwc
{
namespace
{
-DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier, bool is_g71)
+DWCComputeKernelInfo configure_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ bool is_g71)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape wei_shape = wei->tensor_shape();
const size_t kernel_c = wei_shape[idx_c];
const size_t kernel_w = wei_shape[idx_w];
desc.export_input_to_cl_image = false;
- if(is_g71)
+ if (is_g71)
{
desc.export_weights_to_cl_image = false;
}
@@ -60,17 +65,17 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
desc.n0 = 4;
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -81,14 +86,15 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
@@ -106,16 +112,20 @@ DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *we
return desc;
}
-DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier, bool is_g71)
+DWCComputeKernelInfo configure_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier,
+ bool is_g71)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Src and weights have the same dimension indices
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape src_shape = src->tensor_shape();
const TensorShape wei_shape = wei->tensor_shape();
const size_t src_w = src_shape[idx_w];
@@ -124,7 +134,7 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
desc.export_input_to_cl_image = false;
- if(is_g71)
+ if (is_g71)
{
desc.export_weights_to_cl_image = false;
}
@@ -133,9 +143,9 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
}
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- if(desc.export_weights_to_cl_image == false)
+ if (desc.export_weights_to_cl_image == false)
{
desc.n0 = 8;
}
@@ -146,11 +156,11 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -161,20 +171,21 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
else
{
- if((src_w % 5) == 0)
+ if ((src_w % 5) == 0)
{
desc.m0 = 5;
}
@@ -194,27 +205,30 @@ DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *we
}
} // namespace
-ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
- : IClDWCNativeKernelConfig(gpu)
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
{
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
- using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
- &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
- &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+ &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
- &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
- &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G71:
func = configs_G71.get_function(src->data_type());
@@ -228,43 +242,58 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInf
return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
ARM_COMPUTE_UNUSED(wei);
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = false;
desc.n0 = (depth_multiplier == 1) ? 4 : 1;
- if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
{
desc.m0 = 2;
}
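The depthwise (DWC) heuristics above repeatedly derive n0 from the depth multiplier and then clamp it against the weight channel count via adjust_vec_size. A sketch of that selection, where adjust_n0 is a simplified stand-in for adjust_vec_size rather than its actual implementation:

#include <cstddef>
#include <iostream>

unsigned int adjust_n0(unsigned int n0, std::size_t kernel_c)
{
    while (n0 > kernel_c && n0 > 1)
    {
        n0 /= 2; // halve until the vector length no longer exceeds the channel count
    }
    return n0;
}

unsigned int choose_n0(unsigned int depth_multiplier, std::size_t kernel_c)
{
    unsigned int n0 = 1;
    if (depth_multiplier == 1)
    {
        n0 = 4;
    }
    else if ((depth_multiplier % 4) == 0)
    {
        n0 = 4;
    }
    else if ((depth_multiplier % 2) == 0)
    {
        n0 = 2;
    }
    return adjust_n0(n0, kernel_c);
}

int main()
{
    std::cout << choose_n0(1, 32) << ' ' << choose_n0(8, 32) << ' ' << choose_n0(6, 32) << ' '
              << choose_n0(1, 2) << '\n'; // 4 4 2 2
    return 0;
}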
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
index cec2cae5dd..41d86c9c14 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
@@ -41,20 +41,38 @@ public:
ClDWCNativeDefaultConfigBifrost(GPUTarget gpu);
// Inherited overridden method
- DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier) override;
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
private:
- DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
};
} // namespace cl_dwc
} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
index 51f3787875..ef1bb3858c 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
-#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/GPUTarget.h"
@@ -30,31 +29,36 @@
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
namespace arm_compute
{
namespace cl_dwc
{
-ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
- : IClDWCNativeKernelConfig(gpu)
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
{
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
- using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
- &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
- &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
- ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
- &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
- &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G77:
func = configs_G77.get_function(src->data_type());
@@ -69,15 +73,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInf
return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape wei_shape = wei->tensor_shape();
const size_t kernel_c = wei_shape[idx_c];
const size_t kernel_w = wei_shape[idx_w];
@@ -85,17 +92,17 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
desc.n0 = 4;
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -106,14 +113,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
@@ -131,16 +139,19 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const IT
return desc;
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
// Src and weights have the same dimension indices
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape src_shape = src->tensor_shape();
const TensorShape wei_shape = wei->tensor_shape();
const size_t src_w = src_shape[idx_w];
@@ -150,9 +161,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- if(desc.export_weights_to_cl_image == false)
+ if (desc.export_weights_to_cl_image == false)
{
desc.n0 = 8;
}
@@ -163,11 +174,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -178,20 +189,21 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
else
{
- if((src_w % 5) == 0)
+ if ((src_w % 5) == 0)
{
desc.m0 = 5;
}
@@ -210,19 +222,22 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const IT
return desc;
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
ARM_COMPUTE_UNUSED(wei);
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = false;
desc.n0 = (depth_multiplier == 1) ? 4 : 1;
- if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
{
desc.m0 = 2;
}
@@ -235,15 +250,18 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITe
return desc;
}
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier)
{
DWCComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
- const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
const TensorShape wei_shape = wei->tensor_shape();
const size_t kernel_c = wei_shape[idx_c];
const size_t kernel_w = wei_shape[idx_w];
@@ -251,9 +269,9 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT
desc.export_input_to_cl_image = false;
desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
- if(depth_multiplier == 1)
+ if (depth_multiplier == 1)
{
- if(desc.export_weights_to_cl_image == false)
+ if (desc.export_weights_to_cl_image == false)
{
desc.n0 = 8;
}
@@ -264,11 +282,11 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT
}
else
{
- if((depth_multiplier % 4) == 0)
+ if ((depth_multiplier % 4) == 0)
{
desc.n0 = 4;
}
- else if((depth_multiplier % 2) == 0)
+ else if ((depth_multiplier % 2) == 0)
{
desc.n0 = 2;
}
@@ -279,14 +297,15 @@ DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const IT
}
// Note: If we reduce n0, export to cl_image must be false
- ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+ (desc.export_weights_to_cl_image == true));
desc.n0 = adjust_vec_size(desc.n0, kernel_c);
// Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
+ if (conv_info.stride().first == 1 && dilation.x() == 1)
{
- if((kernel_w >= 9) || (kernel_w == 1))
+ if ((kernel_w >= 9) || (kernel_w == 1))
{
desc.m0 = 1;
}
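Several of these depthwise heuristics also gate the m0 unroll factor on stride_x == 1 and dilation_x == 1, keyed to the kernel and input widths. A hedged sketch of that shape; the fallback value of 4 is assumed for illustration and is not shown in the hunks above:

#include <cstddef>
#include <iostream>

unsigned int choose_m0(std::size_t kernel_w, std::size_t src_w, unsigned int stride_x, unsigned int dilation_x)
{
    if (stride_x == 1 && dilation_x == 1)
    {
        if ((kernel_w >= 9) || (kernel_w == 1))
        {
            return 1;
        }
        // Prefer a factor that divides the input width evenly
        return ((src_w % 5) == 0) ? 5u : 4u;
    }
    return 1; // strided or dilated: no unrolling along the width
}

int main()
{
    std::cout << choose_m0(3, 20, 1, 1) << ' ' << choose_m0(3, 22, 1, 1) << ' '
              << choose_m0(3, 20, 2, 1) << '\n'; // 5 4 1
    return 0;
}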
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
index 4d51fa668c..fabce77b54 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
@@ -41,18 +41,33 @@ public:
ClDWCNativeDefaultConfigValhall(GPUTarget gpu);
// Inherited overridden method
- DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier) override;
+ DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) override;
private:
- DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
- DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier);
};
} // namespace cl_dwc
} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
index 5593c6de61..c8b006c546 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
@@ -32,7 +32,7 @@ namespace cl_dwc
bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier)
{
// Check whether we can use the cl image with the weights.
- if(!export_to_cl_image(weights))
+ if (!export_to_cl_image(weights))
{
return false;
}
@@ -45,12 +45,12 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul
// If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons:
// 1- When the kernel size is 1x1
// 2- When the depth multiplier is greater than 1 and not multiple of 4.
- if((kernel_w == 1) && (kernel_h == 1))
+ if ((kernel_w == 1) && (kernel_h == 1))
{
return false;
}
- if((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+ if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
{
return false;
}
@@ -58,4 +58,4 @@ bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_mul
return true;
}
} // namespace cl_dwc
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
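use_cl_image_for_weights above prefers the plain buffer path for 1x1 kernels and for depth multipliers greater than one that are not multiples of four. A standalone sketch of that decision logic, with the export_to_cl_image capability check replaced by a boolean parameter for illustration:

#include <iostream>

bool use_cl_image_for_weights(bool can_export_to_cl_image, unsigned int kernel_w, unsigned int kernel_h,
                              unsigned int depth_multiplier)
{
    if (!can_export_to_cl_image)
    {
        return false; // the weights cannot be exported to a cl_image at all
    }
    if ((kernel_w == 1) && (kernel_h == 1))
    {
        return false; // 1x1 kernels: the buffer path performs better
    }
    if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
    {
        return false; // multipliers not divisible by 4 do not map well onto the image path
    }
    return true;
}

int main()
{
    std::cout << std::boolalpha
              << use_cl_image_for_weights(true, 3, 3, 1) << ' '   // true
              << use_cl_image_for_weights(true, 1, 1, 1) << ' '   // false
              << use_cl_image_for_weights(true, 3, 3, 6) << '\n'; // false
    return 0;
}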
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
index c08053dcb3..49ce6ff479 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -46,7 +46,7 @@ public:
*/
static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
// The heuristic for Midgard is the same as the one used for Arm Mali-G71
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
index b5df132a12..614a6622df 100644
--- a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -52,8 +53,7 @@ public:
* @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -65,7 +65,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -92,8 +92,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClDWCNativeKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
@@ -107,8 +106,11 @@ public:
* @param[in] dilation Kernel dilation
* @param[in] depth_multiplier Output feature maps multiplier
*/
- virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier) = 0;
+ virtual DWCComputeKernelInfo configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ unsigned int depth_multiplier) = 0;
protected:
GPUTarget _target;
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
index 990f050112..3380d8f1b7 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -35,17 +35,19 @@ namespace cl_indirect_conv
{
using namespace arm_compute::misc::shape_calculator;
-ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu)
- : IClIndirectConvKernelConfig(gpu)
+ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu)
{
}
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
- using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(
+ const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32,
- &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
+ ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+ &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
// Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes
// indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned
@@ -57,22 +59,24 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const
return (this->*func)(src, wei, conv_info);
}
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
- const int32_t stride_x = conv_info.stride().first;
- const int32_t stride_y = conv_info.stride().second;
- const int32_t ofm = dst_shape[0];
- const int32_t m = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y);
+ const int32_t stride_x = conv_info.stride().first;
+ const int32_t stride_y = conv_info.stride().second;
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y);
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(ofm <= 4)
+ if (ofm <= 4)
{
desc.m0 = 1;
desc.n0 = 2;
@@ -82,7 +86,7 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3
{
// The 16000 threshold value has been identified as the right
// one for using the biggest block size allowed on F32: 5x4x4
- if(m < 16000)
+ if (m < 16000)
{
desc.m0 = 4;
desc.n0 = 4;
@@ -100,31 +104,33 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f3
return desc;
}
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const PadStrideInfo &conv_info)
{
DirectConvComputeKernelInfo desc;
- if(src->data_layout() == DataLayout::NHWC)
+ if (src->data_layout() == DataLayout::NHWC)
{
- const TensorShape wei_shape = wei->tensor_shape();
- const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
const bool export_weights_to_cl_image = export_to_cl_image(wei);
- const int32_t ofm = dst_shape[0];
- const int32_t m = dst_shape[1] * dst_shape[2];
- const int32_t k = wei_shape[0];
+ const int32_t ofm = dst_shape[0];
+ const int32_t m = dst_shape[1] * dst_shape[2];
+ const int32_t k = wei_shape[0];
desc.export_weights_to_cl_image = export_weights_to_cl_image;
- if(ofm <= 4)
+ if (ofm <= 4)
{
// k0 should be as larger as possible. However, we should avoid
// having left-over for loops that make the implementation slower.
- if((k % 16) == 0)
+ if ((k % 16) == 0)
{
desc.k0 = 16;
}
- else if((k % 8) == 0)
+ else if ((k % 8) == 0)
{
desc.k0 = 8;
}
@@ -140,11 +146,11 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1
{
// The 16000 threshold value has been identified as the right
// one for using the biggest block size allowed on F16: 8x4
- if(m >= 16000 && k < 4)
+ if (m >= 16000 && k < 4)
{
desc.m0 = 8;
desc.n0 = 4;
- desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
+ desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
}
else
{
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
index 68dca91885..bab808c66c 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -41,11 +41,14 @@ public:
ClIndirectConvDefaultConfigValhall(GPUTarget gpu);
// Inherited overridden method
- DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+ DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
private:
- DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
- DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+ DirectConvComputeKernelInfo
+ configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
} // namespace cl_indirect_conv
} // namespace arm_compute
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
index 73fbb87560..dd614e1f68 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
@@ -45,7 +45,7 @@ public:
*/
static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
index d2f4cde662..d05da18b58 100644
--- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -49,8 +50,7 @@ public:
* @param[in] func_f16 Function to call for indirect convolution F16
*
*/
- ClIndirectConvConfigArray(T func_f32, T func_f16)
- : _configs{ func_f32, func_f16}
+ ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16}
{
}
@@ -62,7 +62,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -85,8 +85,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClIndirectConvKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig);
@@ -98,7 +97,8 @@ public:
* @param[in] wei Weights tensor
* @param[in] conv_info Convolution info
*/
- virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+ virtual DirectConvComputeKernelInfo
+ configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
protected:
GPUTarget _target;
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
index 01102b3d60..b3c8d891dc 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -28,30 +28,33 @@
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
-#include <utility>
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
+#include <utility>
+
namespace arm_compute
{
namespace cl_matmul
{
-ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu)
- : IClMatMulNativeKernelConfig(gpu)
+ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu)
{
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
+MatMulKernelInfo
+ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
{
- using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo & info);
+ using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
- ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
- &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
- &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
+ ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
+ &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
ConfigurationFunctionExecutorPtr func = nullptr;
- switch(_target)
+ switch (_target)
{
case GPUTarget::G710:
default:
@@ -67,7 +70,7 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo
const bool is_batched = lhs_shape.num_dimensions() > 2;
- if(is_batched == true)
+ if (is_batched == true)
{
lhs_shape.collapse_from(2);
}
@@ -81,103 +84,48 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo
return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 2, 8, 4, 1 },
- { 24, 464, 412, 24, 2, 8, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 16, 1 },
- { 5776, 64, 32, 36, 2, 4, 16, 1 },
- { 1568, 64, 40, 36, 2, 8, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 8, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 0 },
- { 688, 92, 68, 32, 5, 4, 4, 0 },
- { 24, 464, 412, 24, 6, 2, 8, 0 },
- { 112, 184, 144, 28, 6, 4, 4, 0 },
- { 5776, 64, 32, 36, 5, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 4, 8, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 1 },
- { 4096, 48, 32, 36, 2, 2, 16, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 6, 2, 8, 1 },
- { 112, 184, 144, 28, 4, 2, 16, 1 },
- { 5776, 64, 32, 36, 4, 4, 4, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 4, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t =
- {
- { 3136, 64, 64, 36, 5, 4, 4, 0 },
- { 4096, 48, 32, 36, 5, 4, 4, 0 },
- { 688, 92, 68, 32, 5, 4, 4, 0 },
- { 24, 464, 412, 24, 6, 2, 4, 0 },
- { 112, 184, 144, 28, 5, 4, 4, 0 },
- { 5776, 64, 32, 36, 5, 4, 4, 0 },
- { 1568, 64, 40, 36, 5, 4, 4, 0 },
- { 2920, 64, 64, 24, 6, 2, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 2, 8, 4, 1 },
- { 24, 464, 412, 24, 2, 8, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 16, 1 },
- { 5776, 64, 32, 36, 2, 8, 8, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 4, 4, 4, 0 },
- { 5776, 64, 32, 36, 4, 4, 8, 0 },
- { 1568, 64, 40, 36, 4, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 2, 2, 16, 1 },
- { 112, 184, 144, 28, 4, 4, 4, 1 },
- { 5776, 64, 32, 36, 4, 4, 4, 1 },
- { 1568, 64, 40, 36, 4, 4, 4, 1 },
- { 2920, 64, 64, 24, 4, 4, 4, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 2, 8, 0 },
- { 112, 184, 144, 28, 4, 4, 4, 0 },
- { 5776, 64, 32, 36, 4, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 4, 0 }
- };
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+ {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1},
+ {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+ {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+ {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0},
+ {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+ {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1},
+ {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
const bool adj_lhs = info.adj_lhs();
const bool adj_rhs = info.adj_rhs();
@@ -185,17 +133,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned
const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr;
const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
- if((adj_lhs == false) && (adj_rhs == false))
+ if ((adj_lhs == false) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_nt_nt;
configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
}
- else if((adj_lhs == false) && (adj_rhs == true))
+ else if ((adj_lhs == false) && (adj_rhs == true))
{
configs_best_to_use = &configs_mnkb_best_nt_t;
configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
}
- else if((adj_lhs == true) && (adj_rhs == false))
+ else if ((adj_lhs == true) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_t_nt;
configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
@@ -209,108 +157,51 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned
MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
- return select_info(desc0,
- desc1,
- m, n, k, b, DataType::F32, rhs_lock_padding);
+ return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding);
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 8, 1 },
- { 688, 92, 68, 32, 4, 4, 16, 1 },
- { 24, 464, 412, 24, 4, 4, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 16, 1 },
- { 5776, 64, 32, 36, 4, 4, 8, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt =
- {
- { 3136, 64, 64, 36, 6, 4, 8, 0 },
- { 4096, 48, 32, 36, 6, 4, 8, 0 },
- { 688, 92, 68, 32, 6, 4, 8, 0 },
- { 24, 464, 412, 24, 4, 4, 8, 0 },
- { 112, 184, 144, 28, 6, 4, 8, 0 },
- { 5776, 64, 32, 36, 6, 4, 8, 0 },
- { 1568, 64, 40, 36, 6, 4, 8, 0 },
- { 2920, 64, 64, 24, 6, 4, 8, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
- {
- { 3136, 64, 64, 36, 6, 4, 8, 1 },
- { 4096, 48, 32, 36, 6, 4, 8, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 6, 2, 4, 1 },
- { 112, 184, 144, 28, 4, 2, 16, 1 },
- { 5776, 64, 32, 36, 6, 4, 8, 1 },
- { 1568, 64, 40, 36, 6, 4, 8, 1 },
- { 2920, 64, 64, 24, 6, 4, 8, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t =
- {
- { 3136, 64, 64, 36, 6, 2, 16, 0 },
- { 4096, 48, 32, 36, 5, 4, 8, 0 },
- { 688, 92, 68, 32, 6, 2, 16, 0 },
- { 24, 464, 412, 24, 6, 2, 16, 0 },
- { 112, 184, 144, 28, 6, 2, 16, 0 },
- { 5776, 64, 32, 36, 5, 4, 8, 0 },
- { 1568, 64, 40, 36, 5, 4, 8, 0 },
- { 2920, 64, 64, 24, 6, 2, 16, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 4, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 4, 4, 4, 1 },
- { 112, 184, 144, 28, 4, 4, 4, 1 },
- { 5776, 64, 32, 36, 4, 4, 4, 1 },
- { 1568, 64, 40, 36, 4, 4, 4, 1 },
- { 2920, 64, 64, 24, 4, 4, 4, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 4, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 4, 4, 4, 0 },
- { 5776, 64, 32, 36, 4, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 1 },
- { 4096, 48, 32, 36, 4, 4, 8, 1 },
- { 688, 92, 68, 32, 4, 4, 4, 1 },
- { 24, 464, 412, 24, 4, 2, 8, 1 },
- { 112, 184, 144, 28, 4, 2, 16, 1 },
- { 5776, 64, 32, 36, 4, 4, 16, 1 },
- { 1568, 64, 40, 36, 4, 4, 8, 1 },
- { 2920, 64, 64, 24, 4, 4, 16, 1 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t =
- {
- { 3136, 64, 64, 36, 4, 4, 8, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 0 },
- { 688, 92, 68, 32, 4, 4, 8, 0 },
- { 24, 464, 412, 24, 4, 4, 8, 0 },
- { 112, 184, 144, 28, 4, 4, 8, 0 },
- { 5776, 64, 32, 36, 4, 4, 8, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 4, 8, 0 }
- };
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1},
+ {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+ {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0},
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0},
+ {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1},
+ {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+ {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0},
+ {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0},
+ {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+ {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1},
+ {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1},
+ {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+ {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
const bool adj_lhs = info.adj_lhs();
const bool adj_rhs = info.adj_rhs();
@@ -318,17 +209,17 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned
const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr;
const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
- if((adj_lhs == false) && (adj_rhs == false))
+ if ((adj_lhs == false) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_nt_nt;
configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
}
- else if((adj_lhs == false) && (adj_rhs == true))
+ else if ((adj_lhs == false) && (adj_rhs == true))
{
configs_best_to_use = &configs_mnkb_best_nt_t;
configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
}
- else if((adj_lhs == true) && (adj_rhs == false))
+ else if ((adj_lhs == true) && (adj_rhs == false))
{
configs_best_to_use = &configs_mnkb_best_t_nt;
configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
@@ -342,75 +233,46 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned
MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
- return select_info(desc0,
- desc1,
- m, n, k, b, DataType::F16, rhs_lock_padding);
+ return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding);
}
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
ARM_COMPUTE_UNUSED(rhs_lock_padding);
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
- {
- { 3136, 64, 64, 36, 6, 4, 4, 0 },
- { 4096, 48, 32, 36, 6, 4, 4, 0 },
- { 688, 92, 68, 32, 2, 8, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 6, 4, 4, 0 },
- { 5776, 64, 32, 36, 6, 4, 4, 0 },
- { 1568, 64, 40, 36, 6, 4, 4, 0 },
- { 2920, 64, 64, 24, 5, 4, 4, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
- {
- { 3136, 64, 64, 36, 4, 4, 16, 0 },
- { 4096, 48, 32, 36, 4, 4, 16, 0 },
- { 688, 92, 68, 32, 4, 4, 16, 0 },
- { 24, 464, 412, 24, 6, 2, 16, 0 },
- { 112, 184, 144, 28, 4, 4, 16, 0 },
- { 5776, 64, 32, 36, 4, 4, 16, 0 },
- { 1568, 64, 40, 36, 6, 4, 4, 0 },
- { 2920, 64, 64, 24, 4, 4, 16, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
- {
- { 3136, 64, 64, 36, 4, 4, 8, 0 },
- { 4096, 48, 32, 36, 4, 4, 8, 0 },
- { 688, 92, 68, 32, 4, 4, 4, 0 },
- { 24, 464, 412, 24, 4, 4, 4, 0 },
- { 112, 184, 144, 28, 4, 4, 8, 0 },
- { 5776, 64, 32, 36, 4, 4, 8, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 4, 8, 0 }
- };
-
- const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
- {
- { 3136, 64, 64, 36, 4, 2, 16, 0 },
- { 4096, 48, 32, 36, 4, 4, 4, 0 },
- { 688, 92, 68, 32, 4, 4, 8, 0 },
- { 24, 464, 412, 24, 4, 2, 16, 0 },
- { 112, 184, 144, 28, 4, 2, 16, 0 },
- { 5776, 64, 32, 36, 4, 4, 4, 0 },
- { 1568, 64, 40, 36, 4, 4, 8, 0 },
- { 2920, 64, 64, 24, 4, 2, 16, 0 }
- };
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+ {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0},
+ {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+ {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0},
+ {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0},
+ {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+ {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0},
+ {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
+
+ const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+ {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0},
+ {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+ {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}};
const bool adj_lhs = info.adj_lhs();
const bool adj_rhs = info.adj_rhs();
- if((adj_lhs == false) && (adj_rhs == false))
+ if ((adj_lhs == false) && (adj_rhs == false))
{
return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b);
}
- else if((adj_lhs == false) && (adj_rhs == true))
+ else if ((adj_lhs == false) && (adj_rhs == true))
{
return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b);
}
- else if((adj_lhs == true) && (adj_rhs == false))
+ else if ((adj_lhs == true) && (adj_rhs == false))
{
return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b);
}
@@ -419,5 +281,5 @@ MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned
return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b);
}
}
-} // namespace opencl
+} // namespace cl_matmul
} // namespace arm_compute
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
index fe167d18dd..6b39db6a3f 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -44,10 +44,13 @@ public:
MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override;
private:
- MatMulKernelInfo configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
- MatMulKernelInfo configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
- MatMulKernelInfo configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G710_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
};
-} // namespace opencl
+} // namespace cl_matmul
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
index 1e06e84d4d..89cad30214 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
+
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <limits>
@@ -37,22 +38,32 @@ namespace cl_matmul
{
MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
const MatMulKernelInfo &info1,
- unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding)
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding)
{
- ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, "The fallback MatMul configuration cannot have export_to_cl_image = true");
- ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, "The MatMul configurations must have the same adj_lhs value");
- ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, "The MatMul configurations must have the same adj_rhs value");
+ ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+ "The fallback MatMul configuration cannot have export_to_cl_image = true");
+ ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+ "The MatMul configurations must have the same adj_lhs value");
+ ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+ "The MatMul configurations must have the same adj_rhs value");
const bool adj_lhs = info0.adj_lhs;
const bool adj_rhs = info0.adj_rhs;
- TensorInfo lhs_info = !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
- TensorInfo rhs_info = !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+ TensorInfo lhs_info =
+ !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+ TensorInfo rhs_info =
+ !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
TensorInfo dst_info;
- if(rhs_lock_padding == false)
+ if (rhs_lock_padding == false)
{
- if(bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+ if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
{
return info0;
}
@@ -67,7 +78,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
}
}
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b)
{
size_t min_acc = std::numeric_limits<size_t>::max();
size_t min_idx = 0;
@@ -76,12 +93,13 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh
const size_t num_rows = configs.size();
const size_t num_cols = configs[0].size();
- ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
+ ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+ "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
ARM_COMPUTE_UNUSED(num_cols);
// Find nearest GeMM workload
// Note: the workload does not depend on the K dimension
- for(size_t y = 0; y < num_rows; ++y)
+ for (size_t y = 0; y < num_rows; ++y)
{
size_t mc0 = static_cast<size_t>(configs[y][0]);
size_t nc0 = static_cast<size_t>(configs[y][1]);
@@ -94,7 +112,7 @@ MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lh
acc += (k - kc0) * (k - kc0);
acc += (b - bc0) * (b - bc0);
acc = std::sqrt(acc);
- if(acc < min_acc)
+ if (acc < min_acc)
{
min_acc = acc;
min_idx = y;
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
index 3881617558..a114fffa68 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -52,7 +52,12 @@ using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>;
*/
MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
const MatMulKernelInfo &info1,
- unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding);
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b,
+ DataType data_type,
+ bool rhs_lock_padding);
/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user
*
@@ -66,7 +71,13 @@ MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
*
* @return @ref MatMulKernelInfo
*/
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+ bool adj_lhs,
+ bool adj_rhs,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ unsigned int b);
} // namespace cl_matmul
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
index a2dbfc7dd5..b10018a6d2 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
@@ -45,7 +45,7 @@ public:
*/
static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
{
- switch(get_arch_from_target(gpu))
+ switch (get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
@@ -56,6 +56,6 @@ public:
}
}
};
-} // namespace opencl
+} // namespace cl_matmul
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
index 4f548bd01d..b9b091100c 100644
--- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -28,6 +28,7 @@
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/MatMulInfo.h"
+
#include "src/core/common/Macros.h"
namespace arm_compute
@@ -53,8 +54,7 @@ public:
* @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
*
*/
- ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8)
- : _configs{ func_f32, func_f16, func_int8 }
+ ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
{
}
@@ -66,7 +66,7 @@ public:
*/
T get_function(DataType data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
return _configs.at(DT_F32);
@@ -93,8 +93,7 @@ public:
*
* @param[in] arch GPU target
*/
- IClMatMulNativeKernelConfig(GPUTarget arch)
- : _target(arch)
+ IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
{
}
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);
diff --git a/support/Bfloat16.h b/support/Bfloat16.h
index e67c729a6c..17013294e2 100644
--- a/support/Bfloat16.h
+++ b/support/Bfloat16.h
@@ -43,18 +43,17 @@ inline uint16_t float_to_bf16(const float v)
#if defined(ARM_COMPUTE_ENABLE_BF16)
uint16_t res;
- __asm __volatile(
- "ldr s0, [%[fromptr]]\n"
- ".inst 0x1e634000\n" // BFCVT h0, s0
- "str h0, [%[toptr]]\n"
- :
- : [fromptr] "r"(fromptr), [toptr] "r"(&res)
- : "v0", "memory");
+ __asm __volatile("ldr s0, [%[fromptr]]\n"
+ ".inst 0x1e634000\n" // BFCVT h0, s0
+ "str h0, [%[toptr]]\n"
+ :
+ : [fromptr] "r"(fromptr), [toptr] "r"(&res)
+ : "v0", "memory");
#else /* defined(ARM_COMPUTE_ENABLE_BF16) */
uint16_t res = (*fromptr >> 16);
const uint16_t error = (*fromptr & 0x0000ffff);
uint16_t bf_l = res & 0x0001;
- if((error > 0x8000) || ((error == 0x8000) && (bf_l != 0)))
+ if ((error > 0x8000) || ((error == 0x8000) && (bf_l != 0)))
{
res += 1;
}
@@ -75,23 +74,21 @@ inline float bf16_to_float(const uint16_t &v)
memcpy(&fp, &lv, sizeof(lv));
return fp;
}
-}
+} // namespace
/** Brain floating point representation class */
class bfloat16 final
{
public:
/** Default Constructor */
- bfloat16()
- : value(0)
+ bfloat16() : value(0)
{
}
/** Constructor
*
* @param[in] v Floating-point value
*/
- bfloat16(float v)
- : value(float_to_bf16(v))
+ bfloat16(float v) : value(float_to_bf16(v))
{
}
/** Assignment operator
diff --git a/support/Cast.h b/support/Cast.h
index 53d5f68065..5fd763690b 100644
--- a/support/Cast.h
+++ b/support/Cast.h
@@ -46,7 +46,7 @@ namespace cast
template <typename Target, typename Source>
inline Target polymorphic_cast(Source *v)
{
- if(dynamic_cast<Target>(v) == nullptr)
+ if (dynamic_cast<Target>(v) == nullptr)
{
ARM_COMPUTE_THROW(std::bad_cast());
}
@@ -86,7 +86,7 @@ inline Target polymorphic_downcast(Source *v)
template <typename Target, typename Source, typename Deleter>
std::unique_ptr<Target, Deleter> polymorphic_cast_unique_ptr(std::unique_ptr<Source, Deleter> &&v)
{
- if(dynamic_cast<Target *>(v.get()) == nullptr)
+ if (dynamic_cast<Target *>(v.get()) == nullptr)
{
ARM_COMPUTE_THROW(std::bad_cast());
}
diff --git a/support/DeepCopy.h b/support/DeepCopy.h
index 0117897901..c0279284c0 100644
--- a/support/DeepCopy.h
+++ b/support/DeepCopy.h
@@ -40,9 +40,8 @@ namespace
template <typename Base, typename Derived>
Base *default_polymorphic_copy(const Base *ptr)
{
- static_assert(std::is_base_of<Base, Derived>::value,
- "Derived is not a specialization of Base");
- if(ptr == nullptr)
+ static_assert(std::is_base_of<Base, Derived>::value, "Derived is not a specialization of Base");
+ if (ptr == nullptr)
{
return nullptr;
}
@@ -62,25 +61,18 @@ class deep_unique_ptr
public:
using CopyFunc = std::function<Base *(const Base *)>;
- deep_unique_ptr(std::nullptr_t val = nullptr) noexcept
- : _val{ val },
- _copy{}
+ deep_unique_ptr(std::nullptr_t val = nullptr) noexcept : _val{val}, _copy{}
{
}
template <typename Derived, typename CopyFuncDerived>
- deep_unique_ptr(Derived *value, const CopyFuncDerived &copy) noexcept
- : _val{ value },
- _copy{ std::move(copy) }
+ deep_unique_ptr(Derived *value, const CopyFuncDerived &copy) noexcept : _val{value}, _copy{std::move(copy)}
{
- static_assert(std::is_base_of<Base, Derived>::value,
- "Derived is not a specialization of Base");
- static_assert(
- std::is_constructible<CopyFunc, CopyFuncDerived>::value,
- "CopyFuncDerived is not valid for a copy functor");
+ static_assert(std::is_base_of<Base, Derived>::value, "Derived is not a specialization of Base");
+ static_assert(std::is_constructible<CopyFunc, CopyFuncDerived>::value,
+ "CopyFuncDerived is not valid for a copy functor");
}
- deep_unique_ptr(const deep_unique_ptr<Base> &ptr)
- : deep_unique_ptr(ptr.clone())
+ deep_unique_ptr(const deep_unique_ptr<Base> &ptr) : deep_unique_ptr(ptr.clone())
{
}
deep_unique_ptr &operator=(const deep_unique_ptr<Base> &ptr)
@@ -90,7 +82,7 @@ public:
return *this;
}
- deep_unique_ptr(deep_unique_ptr<Base> &&ptr) = default;
+ deep_unique_ptr(deep_unique_ptr<Base> &&ptr) = default;
deep_unique_ptr &operator=(deep_unique_ptr<Base> &&ptr) = default;
~deep_unique_ptr() = default;
friend void swap(deep_unique_ptr &ptr0, deep_unique_ptr<Base> &ptr1) noexcept
@@ -135,11 +127,11 @@ public:
bool operator==(const deep_unique_ptr<Base> &rhs) const
{
- if(rhs.get() == nullptr && _val == nullptr)
+ if (rhs.get() == nullptr && _val == nullptr)
{
return true;
}
- else if(rhs.get() == nullptr || _val == nullptr)
+ else if (rhs.get() == nullptr || _val == nullptr)
{
return false;
}
@@ -152,9 +144,9 @@ public:
private:
deep_unique_ptr clone() const
{
- return { _copy(_val.get()), CopyFunc(_copy) };
+ return {_copy(_val.get()), CopyFunc(_copy)};
}
- std::unique_ptr<Base> _val{ nullptr };
+ std::unique_ptr<Base> _val{nullptr};
CopyFunc _copy{};
};
@@ -170,34 +162,26 @@ private:
template <typename Base, typename Derived, typename CopyFunc>
deep_unique_ptr<Base> make_deep_unique(Derived &&temp, CopyFunc copy)
{
- return
- {
- new Derived(std::move(temp)),
- CopyFunc{ std::move(copy) }
- };
+ return {new Derived(std::move(temp)), CopyFunc{std::move(copy)}};
}
template <typename Base, typename Derived>
deep_unique_ptr<Base> make_deep_unique(Derived &&temp)
{
- static_assert(std::is_base_of<Base, Derived>::value,
- "Derived is not a specialization of Base");
+ static_assert(std::is_base_of<Base, Derived>::value, "Derived is not a specialization of Base");
- return make_deep_unique<Base, Derived>(
- std::move(temp), default_polymorphic_copy<Base, Derived>);
+ return make_deep_unique<Base, Derived>(std::move(temp), default_polymorphic_copy<Base, Derived>);
}
template <typename Base, typename Derived, typename... Args>
-deep_unique_ptr<Base> make_deep_unique(Args &&... args)
+deep_unique_ptr<Base> make_deep_unique(Args &&...args)
{
- static_assert(std::is_constructible<Derived, Args...>::value,
- "Cannot instantiate Derived from arguments");
+ static_assert(std::is_constructible<Derived, Args...>::value, "Cannot instantiate Derived from arguments");
- return make_deep_unique<Base, Derived>(
- std::move(Derived{ std::forward<Args>(args)... }));
+ return make_deep_unique<Base, Derived>(std::move(Derived{std::forward<Args>(args)...}));
}
} // namespace memory
} // namespace utils
} // namespace arm_compute
-#endif // ARM_COMPUTE_MISC_ITERABLE_H
\ No newline at end of file
+#endif // ARM_COMPUTE_MISC_ITERABLE_H
diff --git a/support/Half.h b/support/Half.h
index 081da5ebc1..f5c27da2d3 100644
--- a/support/Half.h
+++ b/support/Half.h
@@ -24,12 +24,12 @@
#ifndef __ARM_COMPUTE_HALF_H__
#define __ARM_COMPUTE_HALF_H__
-#if(BARE_METAL)
+#if (BARE_METAL)
#define HALF_ENABLE_CPP11_CMATH 0
#endif /* BARE_METAL */
// Set style to round to nearest
-#define HALF_ROUND_STYLE 1
+#define HALF_ROUND_STYLE 1
#define HALF_ROUND_TIES_TO_EVEN 1
#include "half/half.hpp"
diff --git a/support/Iterable.h b/support/Iterable.h
index a0bafaf4ce..8d99e70196 100644
--- a/support/Iterable.h
+++ b/support/Iterable.h
@@ -44,8 +44,7 @@ public:
*
* @param[in] it Value to reverse iterate on
*/
- explicit reverse_iterable(T &it)
- : _it(it)
+ explicit reverse_iterable(T &it) : _it(it)
{
}
diff --git a/support/Mutex.h b/support/Mutex.h
index 6e68fa5248..9c2b55c3ac 100644
--- a/support/Mutex.h
+++ b/support/Mutex.h
@@ -50,10 +50,10 @@ public:
~Mutex() = default;
/** Lock */
- void lock() {};
+ void lock(){};
/** Unlock */
- void unlock() {};
+ void unlock(){};
/** Try the lock.
*
@@ -73,8 +73,7 @@ public:
typedef Mutex mutex_type;
public:
- explicit lock_guard(Mutex &m_)
- : m(m_)
+ explicit lock_guard(Mutex &m_) : m(m_)
{
}
~lock_guard()
@@ -97,15 +96,14 @@ public:
unique_lock() noexcept : m(nullptr)
{
}
- explicit unique_lock(mutex_type &m)
- : m(&m)
+ explicit unique_lock(mutex_type &m) : m(&m)
{
}
- unique_lock(const unique_lock &) = delete;
- unique_lock(unique_lock &&) = default;
+ unique_lock(const unique_lock &) = delete;
+ unique_lock(unique_lock &&) = default;
unique_lock &operator=(const unique_lock &) = delete;
- unique_lock &operator=(unique_lock &&) = default;
- ~unique_lock() = default;
+ unique_lock &operator=(unique_lock &&) = default;
+ ~unique_lock() = default;
void lock()
{
}
@@ -121,5 +119,5 @@ private:
mutex_type *m;
};
#endif /* NO_MULTI_THREADING */
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_MUTEX_H__ */
diff --git a/support/Random.h b/support/Random.h
index 7658e6d529..1a804d3290 100644
--- a/support/Random.h
+++ b/support/Random.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_MISC_RANDOM_H
#include "arm_compute/core/Error.h"
+
#include "utils/Utils.h"
#include <random>
@@ -47,7 +48,9 @@ public:
static constexpr bool is_fp_16bit = std::is_same<T, half>::value || std::is_same<T, bfloat16>::value;
static constexpr bool is_integral = std::is_integral<T>::value && !is_fp_16bit;
- using fp_dist = typename std::conditional<is_fp_16bit, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+ using fp_dist = typename std::conditional<is_fp_16bit,
+ arm_compute::utils::uniform_real_distribution_16bit<T>,
+ std::uniform_real_distribution<T>>::type;
using DT = typename std::conditional<is_integral, std::uniform_int_distribution<T>, fp_dist>::type;
using result_type = T;
using range_pair = std::pair<result_type, result_type>;
@@ -62,7 +65,7 @@ public:
: _distributions(), _selector()
{
result_type clow = low;
- for(const auto &erange : exclude_ranges)
+ for (const auto &erange : exclude_ranges)
{
result_type epsilon = is_integral ? result_type(1) : result_type(std::numeric_limits<T>::epsilon());
diff --git a/support/Rounding.h b/support/Rounding.h
index e2732dc459..5691a6680b 100644
--- a/support/Rounding.h
+++ b/support/Rounding.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/Traits.h"
+
#include "support/AclRequires.h"
#include "support/ToolchainSupport.h"
@@ -153,10 +154,10 @@ inline T round_half_even(T value, T epsilon = std::numeric_limits<T>::epsilon())
T ipart = 0;
std::modf(positive_value, &ipart);
// If 'value' is exactly halfway between two integers
- if(std::abs(positive_value - (ipart + 0.5f)) < epsilon)
+ if (std::abs(positive_value - (ipart + 0.5f)) < epsilon)
{
// If 'ipart' is even then return 'ipart'
- if(std::fmod(ipart, 2.f) < epsilon)
+ if (std::fmod(ipart, 2.f) < epsilon)
{
return support::cpp11::copysign(ipart, value);
}
@@ -179,7 +180,7 @@ inline T round_half_even(T value, T epsilon = std::numeric_limits<T>::epsilon())
template <typename T, ARM_COMPUTE_REQUIRES_TA(traits::is_floating_point<T>::value)>
inline T round(T value, RoundingMode rounding_mode)
{
- switch(rounding_mode)
+ switch (rounding_mode)
{
case RoundingMode::TO_ZERO:
return round_to_zero(value);
diff --git a/support/SaturateCast.h b/support/SaturateCast.h
index a9982d8e96..7af9f983ed 100644
--- a/support/SaturateCast.h
+++ b/support/SaturateCast.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/utils/misc/Traits.h"
#include "arm_compute/core/utils/misc/Utility.h"
+
#include "support/Rounding.h"
namespace arm_compute
diff --git a/support/Semaphore.h b/support/Semaphore.h
index e182b53a2d..f44179332b 100644
--- a/support/Semaphore.h
+++ b/support/Semaphore.h
@@ -24,8 +24,9 @@
#ifndef __ARM_COMPUTE_UTILS_SEMAMPHORE_H__
#define __ARM_COMPUTE_UTILS_SEMAMPHORE_H__
-#include "Mutex.h"
#include "support/Mutex.h"
+
+#include "Mutex.h"
#include <condition_variable>
namespace arm_compute
@@ -39,8 +40,7 @@ public:
*
* @param[in] value Semaphore initial value
*/
- Semaphore(int value = 0)
- : _value(value), _m(), _cv()
+ Semaphore(int value = 0) : _value(value), _m(), _cv()
{
}
/** Signals a semaphore */
@@ -56,10 +56,7 @@ public:
inline void wait()
{
std::unique_lock<std::mutex> lock(_m);
- _cv.wait(lock, [this]()
- {
- return _value > 0;
- });
+ _cv.wait(lock, [this]() { return _value > 0; });
--_value;
}
@@ -73,8 +70,7 @@ private:
class Semaphore
{
public:
- Semaphore(int value = 0)
- : _value(value)
+ Semaphore(int value = 0) : _value(value)
{
(void)_value;
}
@@ -93,5 +89,5 @@ private:
int _value;
};
#endif /* NO_MULTI_THREADING */
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_UTILS_SEMAMPHORE_H__ */
diff --git a/support/StringSupport.h b/support/StringSupport.h
index e8b3ca7ab3..7d1b5e7778 100644
--- a/support/StringSupport.h
+++ b/support/StringSupport.h
@@ -57,14 +57,14 @@ inline int stoi(const std::string &str, std::size_t *pos = 0, NumericBase base =
assert(base == NumericBase::BASE_10 || base == NumericBase::BASE_16);
unsigned int x;
std::stringstream ss;
- if(base == NumericBase::BASE_16)
+ if (base == NumericBase::BASE_16)
{
ss << std::hex;
}
ss << str;
ss >> x;
- if(pos)
+ if (pos)
{
std::string s;
std::stringstream ss_p;
@@ -93,14 +93,14 @@ inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, Numeric
assert(base == NumericBase::BASE_10 || base == NumericBase::BASE_16);
std::stringstream stream;
unsigned long value = 0;
- if(base == NumericBase::BASE_16)
+ if (base == NumericBase::BASE_16)
{
stream << std::hex;
}
stream << str;
stream >> value;
- if(pos)
+ if (pos)
{
std::string s;
std::stringstream ss_p;
@@ -113,7 +113,7 @@ inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, Numeric
return value;
}
-#if(__ANDROID__ || BARE_METAL)
+#if (__ANDROID__ || BARE_METAL)
/** Convert integer and float values to string.
*
* @note This function implements the same behaviour as std::to_string. The
@@ -124,7 +124,7 @@ inline unsigned long stoul(const std::string &str, std::size_t *pos = 0, Numeric
* @return String representation of @p value.
*/
template <typename T, typename std::enable_if<std::is_arithmetic<typename std::decay<T>::type>::value, int>::type = 0>
-inline std::string to_string(T && value)
+inline std::string to_string(T &&value)
{
std::stringstream stream;
stream << std::forward<T>(value);
@@ -186,7 +186,7 @@ inline std::string to_string(const std::string &value)
* @return Float representation of input string.
*/
template <typename... Ts>
-int stof(Ts &&... args)
+int stof(Ts &&...args)
{
return ::std::stof(std::forward<Ts>(args)...);
}
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index 96826dad5e..4d394889c3 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -24,6 +24,9 @@
#ifndef ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT
#define ARM_COMPUTE_SUPPORT_TOOLCHAINSUPPORT
+#include "support/Bfloat16.h"
+#include "support/Half.h"
+
#include <cassert>
#include <cmath>
#include <cstddef>
@@ -33,9 +36,6 @@
#include <string>
#include <type_traits>
-#include "support/Bfloat16.h"
-#include "support/Half.h"
-
#ifndef M_PI
#define M_PI (3.14159265358979323846)
#endif // M_PI
@@ -50,7 +50,7 @@ namespace support
{
namespace cpp11
{
-#if(__ANDROID__ || BARE_METAL)
+#if (__ANDROID__ || BARE_METAL)
template <typename T>
inline T nearbyint(T value)
{
@@ -129,11 +129,12 @@ inline T copysign(T x, T y)
*
* @return Result floating point value equal to (x*y) + z.c
*/
-template < typename T, typename = typename std::enable_if < std::is_floating_point<T>::value
+template <typename T,
+ typename = typename std::enable_if<std::is_floating_point<T>::value
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- || std::is_same<T, float16_t>::value
+ || std::is_same<T, float16_t>::value
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- >::type >
+ >::type>
inline T fma(T x, T y, T z)
{
return ::fma(x, y, z);
@@ -151,7 +152,7 @@ inline T fma(T x, T y, T z)
* if successful (not including the ending null character), or a negative value if an error occurred.
*/
template <typename... Ts>
-inline int snprintf(char *s, size_t n, const char *fmt, Ts &&... args)
+inline int snprintf(char *s, size_t n, const char *fmt, Ts &&...args)
{
return ::snprintf(s, n, fmt, std::forward<Ts>(args)...);
}
@@ -244,11 +245,12 @@ inline T copysign(T x, T y)
*
* @return Result floating point value equal to (x*y) + z.
*/
-template < typename T, typename = typename std::enable_if < std::is_floating_point<T>::value
+template <typename T,
+ typename = typename std::enable_if<std::is_floating_point<T>::value
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- || std::is_same<T, float16_t>::value
+ || std::is_same<T, float16_t>::value
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- >::type >
+ >::type>
inline T fma(T x, T y, T z)
{
return std::fma(x, y, z);
@@ -266,7 +268,7 @@ inline T fma(T x, T y, T z)
* if successful (not including the ending null character), or a negative value if an error occurred.
*/
template <typename... Ts>
-inline int snprintf(char *s, std::size_t n, const char *fmt, Ts &&... args)
+inline int snprintf(char *s, std::size_t n, const char *fmt, Ts &&...args)
{
return std::snprintf(s, n, fmt, std::forward<Ts>(args)...);
}
diff --git a/utils/CommonGraphOptions.cpp b/utils/CommonGraphOptions.cpp
index c0270726da..42524d802d 100644
--- a/utils/CommonGraphOptions.cpp
+++ b/utils/CommonGraphOptions.cpp
@@ -37,15 +37,15 @@ namespace
{
std::pair<unsigned int, unsigned int> parse_validation_range(const std::string &validation_range)
{
- std::pair<unsigned int /* start */, unsigned int /* end */> range = { 0, std::numeric_limits<unsigned int>::max() };
- if(!validation_range.empty())
+ std::pair<unsigned int /* start */, unsigned int /* end */> range = {0, std::numeric_limits<unsigned int>::max()};
+ if (!validation_range.empty())
{
std::string str;
std::stringstream stream(validation_range);
// Get first value
std::getline(stream, str, ',');
- if(stream.fail())
+ if (stream.fail())
{
return range;
}
@@ -56,7 +56,7 @@ std::pair<unsigned int, unsigned int> parse_validation_range(const std::string &
// Get second value
std::getline(stream, str);
- if(stream.fail())
+ if (stream.fail())
{
range.second = range.first;
return range;
@@ -88,24 +88,26 @@ namespace utils
os << "Tuner mode : " << common_params.tuner_mode << std::endl;
os << "Tuner file : " << common_params.tuner_file << std::endl;
os << "MLGO file : " << common_params.mlgo_file << std::endl;
- os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::Enabled ? true_str : false_str) << std::endl;
- if(!common_params.data_path.empty())
+ os << "Fast math enabled? : " << (common_params.fast_math_hint == FastMathHint::Enabled ? true_str : false_str)
+ << std::endl;
+ if (!common_params.data_path.empty())
{
os << "Data path : " << common_params.data_path << std::endl;
}
- if(!common_params.image.empty())
+ if (!common_params.image.empty())
{
os << "Image file : " << common_params.image << std::endl;
}
- if(!common_params.labels.empty())
+ if (!common_params.labels.empty())
{
os << "Labels file : " << common_params.labels << std::endl;
}
- if(!common_params.validation_file.empty())
+ if (!common_params.validation_file.empty())
{
- os << "Validation range : " << common_params.validation_range_start << "-" << common_params.validation_range_end << std::endl;
+ os << "Validation range : " << common_params.validation_range_start << "-" << common_params.validation_range_end
+ << std::endl;
os << "Validation file : " << common_params.validation_file << std::endl;
- if(!common_params.validation_path.empty())
+ if (!common_params.validation_path.empty())
{
os << "Validation path : " << common_params.validation_path << std::endl;
}
@@ -134,33 +136,25 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser)
tuner_file(parser.add_option<SimpleOption<std::string>>("tuner-file")),
mlgo_file(parser.add_option<SimpleOption<std::string>>("mlgo-file"))
{
- std::set<arm_compute::graph::Target> supported_targets
- {
+ std::set<arm_compute::graph::Target> supported_targets{
Target::NEON,
Target::CL,
Target::CLVK,
};
- std::set<arm_compute::DataType> supported_data_types
- {
+ std::set<arm_compute::DataType> supported_data_types{
DataType::F16,
DataType::F32,
DataType::QASYMM8,
DataType::QASYMM8_SIGNED,
};
- std::set<DataLayout> supported_data_layouts
- {
+ std::set<DataLayout> supported_data_layouts{
DataLayout::NHWC,
DataLayout::NCHW,
};
- const std::set<CLTunerMode> supported_tuner_modes
- {
- CLTunerMode::EXHAUSTIVE,
- CLTunerMode::NORMAL,
- CLTunerMode::RAPID
- };
+ const std::set<CLTunerMode> supported_tuner_modes{CLTunerMode::EXHAUSTIVE, CLTunerMode::NORMAL, CLTunerMode::RAPID};
target = parser.add_option<EnumOption<Target>>("target", supported_targets, Target::NEON);
data_type = parser.add_option<EnumOption<DataType>>("type", supported_data_types, DataType::F32);
@@ -175,11 +169,10 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser)
data_layout->set_help("Data layout to use");
enable_tuner->set_help("Enable OpenCL dynamic tuner");
enable_cl_cache->set_help("Enable OpenCL program caches");
- tuner_mode->set_help(
- "Configures the time taken by the tuner to tune. "
- "Exhaustive: slowest but produces the most performant LWS configuration. "
- "Normal: slow but produces the LWS configurations on par with Exhaustive most of the time. "
- "Rapid: fast but produces less performant LWS configurations");
+ tuner_mode->set_help("Configures the time taken by the tuner to tune. "
+ "Exhaustive: slowest but produces the most performant LWS configuration. "
+ "Normal: slow but produces the LWS configurations on par with Exhaustive most of the time. "
+ "Rapid: fast but produces less performant LWS configurations");
fast_math_hint->set_help("Enable fast math");
data_path->set_help("Path where graph parameters reside");
image->set_help("Input image for the graph");
@@ -193,8 +186,9 @@ CommonGraphOptions::CommonGraphOptions(CommandLineParser &parser)
CommonGraphParams consume_common_graph_parameters(CommonGraphOptions &options)
{
- FastMathHint fast_math_hint_value = options.fast_math_hint->value() ? FastMathHint::Enabled : FastMathHint::Disabled;
- auto validation_range = parse_validation_range(options.validation_range->value());
+ FastMathHint fast_math_hint_value =
+ options.fast_math_hint->value() ? FastMathHint::Enabled : FastMathHint::Disabled;
+ auto validation_range = parse_validation_range(options.validation_range->value());
CommonGraphParams common_params;
common_params.help = options.help->is_set() ? options.help->value() : false;
@@ -202,19 +196,21 @@ CommonGraphParams consume_common_graph_parameters(CommonGraphOptions &options)
common_params.batches = options.batches->value();
common_params.target = options.target->value();
common_params.data_type = options.data_type->value();
- if(options.data_layout->is_set())
+ if (options.data_layout->is_set())
{
common_params.data_layout = options.data_layout->value();
}
- common_params.enable_tuner = options.enable_tuner->is_set() ? options.enable_tuner->value() : false;
- common_params.enable_cl_cache = common_params.target == arm_compute::graph::Target::NEON ? false : (options.enable_cl_cache->is_set() ? options.enable_cl_cache->value() : true);
- common_params.tuner_mode = options.tuner_mode->value();
- common_params.fast_math_hint = options.fast_math_hint->is_set() ? fast_math_hint_value : FastMathHint::Disabled;
- common_params.data_path = options.data_path->value();
- common_params.image = options.image->value();
- common_params.labels = options.labels->value();
- common_params.validation_file = options.validation_file->value();
- common_params.validation_path = options.validation_path->value();
+ common_params.enable_tuner = options.enable_tuner->is_set() ? options.enable_tuner->value() : false;
+ common_params.enable_cl_cache = common_params.target == arm_compute::graph::Target::NEON
+ ? false
+ : (options.enable_cl_cache->is_set() ? options.enable_cl_cache->value() : true);
+ common_params.tuner_mode = options.tuner_mode->value();
+ common_params.fast_math_hint = options.fast_math_hint->is_set() ? fast_math_hint_value : FastMathHint::Disabled;
+ common_params.data_path = options.data_path->value();
+ common_params.image = options.image->value();
+ common_params.labels = options.labels->value();
+ common_params.validation_file = options.validation_file->value();
+ common_params.validation_path = options.validation_path->value();
common_params.validation_range_start = validation_range.first;
common_params.validation_range_end = validation_range.second;
common_params.tuner_file = options.tuner_file->value();
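
A hedged sketch of how the two functions touched above are typically wired together: only CommonGraphOptions, consume_common_graph_parameters and CommonGraphParams are taken from this diff, while the main() harness and the parser.parse() call are assumptions about the command-line utilities.

    #include <iostream>

    #include "utils/CommonGraphOptions.h"
    #include "utils/command_line/CommandLineParser.h"

    int main(int argc, char **argv)
    {
        arm_compute::utils::CommandLineParser   parser;          // assumed CLI entry point
        arm_compute::utils::CommonGraphOptions  options(parser); // registers --target, --type, --tuner-mode, ...

        parser.parse(argc, argv); // assumed to populate the registered options

        // Convert the parsed options into the flat parameter struct used by the examples.
        arm_compute::utils::CommonGraphParams params =
            arm_compute::utils::consume_common_graph_parameters(options);

        std::cout << params; // operator<< shown earlier in this file
        return params.help ? 0 : 1;
    }
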
diff --git a/utils/CommonGraphOptions.h b/utils/CommonGraphOptions.h
index afdb78b1be..c42e06cb84 100644
--- a/utils/CommonGraphOptions.h
+++ b/utils/CommonGraphOptions.h
@@ -24,13 +24,13 @@
#ifndef ARM_COMPUTE_EXAMPLES_UTILS_COMMON_GRAPH_OPTIONS
#define ARM_COMPUTE_EXAMPLES_UTILS_COMMON_GRAPH_OPTIONS
-#include "utils/command_line/CommandLineOptions.h"
-#include "utils/command_line/CommandLineParser.h"
-
#include "arm_compute/graph/TypeLoader.h"
#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
+#include "utils/command_line/CommandLineOptions.h"
+#include "utils/command_line/CommandLineParser.h"
+
namespace arm_compute
{
namespace utils
@@ -92,16 +92,16 @@ namespace utils
/** Structure holding all the common graph parameters */
struct CommonGraphParams
{
- bool help{ false };
- int threads{ 0 };
- int batches{ 1 };
- arm_compute::graph::Target target{ arm_compute::graph::Target::NEON };
- arm_compute::DataType data_type{ DataType::F32 };
- arm_compute::DataLayout data_layout{ DataLayout::NHWC };
- bool enable_tuner{ false };
- bool enable_cl_cache{ false };
- arm_compute::CLTunerMode tuner_mode{ CLTunerMode::NORMAL };
- arm_compute::graph::FastMathHint fast_math_hint{ arm_compute::graph::FastMathHint::Disabled };
+ bool help{false};
+ int threads{0};
+ int batches{1};
+ arm_compute::graph::Target target{arm_compute::graph::Target::NEON};
+ arm_compute::DataType data_type{DataType::F32};
+ arm_compute::DataLayout data_layout{DataLayout::NHWC};
+ bool enable_tuner{false};
+ bool enable_cl_cache{false};
+ arm_compute::CLTunerMode tuner_mode{CLTunerMode::NORMAL};
+ arm_compute::graph::FastMathHint fast_math_hint{arm_compute::graph::FastMathHint::Disabled};
std::string data_path{};
std::string image{};
std::string labels{};
@@ -109,8 +109,8 @@ struct CommonGraphParams
std::string validation_path{};
std::string tuner_file{};
std::string mlgo_file{};
- unsigned int validation_range_start{ 0 };
- unsigned int validation_range_end{ std::numeric_limits<unsigned int>::max() };
+ unsigned int validation_range_start{0};
+ unsigned int validation_range_end{std::numeric_limits<unsigned int>::max()};
};
/** Formatted output of the CommonGraphParams type
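
The brace-initialised defaults above translate directly into what a default-constructed parameter struct carries; an illustrative check (the main() below is not part of the library):

    #include <cassert>
    #include <limits>

    #include "utils/CommonGraphOptions.h"

    int main()
    {
        arm_compute::utils::CommonGraphParams params{};

        assert(params.help == false);
        assert(params.batches == 1);
        assert(params.data_type == arm_compute::DataType::F32);
        assert(params.validation_range_end == std::numeric_limits<unsigned int>::max());
        return 0;
    }
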
diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp
index c3f71299f6..ca8e14abba 100644
--- a/utils/GraphUtils.cpp
+++ b/utils/GraphUtils.cpp
@@ -43,18 +43,21 @@ using namespace arm_compute::graph_utils;
namespace
{
-std::pair<arm_compute::TensorShape, arm_compute::PermutationVector> compute_permutation_parameters(const arm_compute::TensorShape &shape,
- arm_compute::DataLayout data_layout)
+std::pair<arm_compute::TensorShape, arm_compute::PermutationVector>
+compute_permutation_parameters(const arm_compute::TensorShape &shape, arm_compute::DataLayout data_layout)
{
// Set permutation parameters if needed
arm_compute::TensorShape permuted_shape = shape;
arm_compute::PermutationVector perm;
// Permute only if num_dimensions greater than 2
- if(shape.num_dimensions() > 2)
+ if (shape.num_dimensions() > 2)
{
- perm = (data_layout == arm_compute::DataLayout::NHWC) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U);
+ perm = (data_layout == arm_compute::DataLayout::NHWC) ? arm_compute::PermutationVector(2U, 0U, 1U)
+ : arm_compute::PermutationVector(1U, 2U, 0U);
- arm_compute::PermutationVector perm_shape = (data_layout == arm_compute::DataLayout::NCHW) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U);
+ arm_compute::PermutationVector perm_shape = (data_layout == arm_compute::DataLayout::NCHW)
+ ? arm_compute::PermutationVector(2U, 0U, 1U)
+ : arm_compute::PermutationVector(1U, 2U, 0U);
arm_compute::permute(permuted_shape, perm_shape);
}
@@ -62,17 +65,16 @@ std::pair<arm_compute::TensorShape, arm_compute::PermutationVector> compute_perm
}
} // namespace
-TFPreproccessor::TFPreproccessor(float min_range, float max_range)
- : _min_range(min_range), _max_range(max_range)
+TFPreproccessor::TFPreproccessor(float min_range, float max_range) : _min_range(min_range), _max_range(max_range)
{
}
void TFPreproccessor::preprocess(ITensor &tensor)
{
- if(tensor.info()->data_type() == DataType::F32)
+ if (tensor.info()->data_type() == DataType::F32)
{
preprocess_typed<float>(tensor);
}
- else if(tensor.info()->data_type() == DataType::F16)
+ else if (tensor.info()->data_type() == DataType::F16)
{
preprocess_typed<half>(tensor);
}
@@ -89,19 +91,20 @@ void TFPreproccessor::preprocess_typed(ITensor &tensor)
window.use_tensor_dimensions(tensor.info()->tensor_shape());
const float range = _max_range - _min_range;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const T value = *reinterpret_cast<T *>(tensor.ptr_to_element(id));
- float res = value / 255.f; // Normalize to [0, 1]
- res = res * range + _min_range; // Map to [min_range, max_range]
- *reinterpret_cast<T *>(tensor.ptr_to_element(id)) = res;
- });
+ execute_window_loop(window,
+ [&](const Coordinates &id)
+ {
+ const T value = *reinterpret_cast<T *>(tensor.ptr_to_element(id));
+ float res = value / 255.f; // Normalize to [0, 1]
+ res = res * range + _min_range; // Map to [min_range, max_range]
+ *reinterpret_cast<T *>(tensor.ptr_to_element(id)) = res;
+ });
}
CaffePreproccessor::CaffePreproccessor(std::array<float, 3> mean, bool bgr, float scale)
: _mean(mean), _bgr(bgr), _scale(scale)
{
- if(_bgr)
+ if (_bgr)
{
std::swap(_mean[0], _mean[2]);
}
@@ -109,11 +112,11 @@ CaffePreproccessor::CaffePreproccessor(std::array<float, 3> mean, bool bgr, floa
void CaffePreproccessor::preprocess(ITensor &tensor)
{
- if(tensor.info()->data_type() == DataType::F32)
+ if (tensor.info()->data_type() == DataType::F32)
{
preprocess_typed<float>(tensor);
}
- else if(tensor.info()->data_type() == DataType::F16)
+ else if (tensor.info()->data_type() == DataType::F16)
{
preprocess_typed<half>(tensor);
}
@@ -130,15 +133,16 @@ void CaffePreproccessor::preprocess_typed(ITensor &tensor)
window.use_tensor_dimensions(tensor.info()->tensor_shape());
const int channel_idx = get_data_layout_dimension_index(tensor.info()->data_layout(), DataLayoutDimension::CHANNEL);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const T value = *reinterpret_cast<T *>(tensor.ptr_to_element(id)) - T(_mean[id[channel_idx]]);
- *reinterpret_cast<T *>(tensor.ptr_to_element(id)) = value * T(_scale);
- });
+ execute_window_loop(window,
+ [&](const Coordinates &id)
+ {
+ const T value =
+ *reinterpret_cast<T *>(tensor.ptr_to_element(id)) - T(_mean[id[channel_idx]]);
+ *reinterpret_cast<T *>(tensor.ptr_to_element(id)) = value * T(_scale);
+ });
}
-PPMWriter::PPMWriter(std::string name, unsigned int maximum)
- : _name(std::move(name)), _iterator(0), _maximum(maximum)
+PPMWriter::PPMWriter(std::string name, unsigned int maximum) : _name(std::move(name)), _iterator(0), _maximum(maximum)
{
}
@@ -150,15 +154,14 @@ bool PPMWriter::access_tensor(ITensor &tensor)
arm_compute::utils::save_to_ppm(tensor, ss.str());
_iterator++;
- if(_maximum == 0)
+ if (_maximum == 0)
{
return true;
}
return _iterator < _maximum;
}
-DummyAccessor::DummyAccessor(unsigned int maximum)
- : _iterator(0), _maximum(maximum)
+DummyAccessor::DummyAccessor(unsigned int maximum) : _iterator(0), _maximum(maximum)
{
}
@@ -171,7 +174,7 @@ bool DummyAccessor::access_tensor(ITensor &tensor)
{
ARM_COMPUTE_UNUSED(tensor);
bool ret = _maximum == 0 || _iterator < _maximum;
- if(_iterator == _maximum)
+ if (_iterator == _maximum)
{
_iterator = 0;
}
@@ -182,7 +185,8 @@ bool DummyAccessor::access_tensor(ITensor &tensor)
return ret;
}
-NumPyAccessor::NumPyAccessor(std::string npy_path, TensorShape shape, DataType data_type, DataLayout data_layout, std::ostream &output_stream)
+NumPyAccessor::NumPyAccessor(
+ std::string npy_path, TensorShape shape, DataType data_type, DataLayout data_layout, std::ostream &output_stream)
: _npy_tensor(), _filename(std::move(npy_path)), _output_stream(output_stream)
{
NumPyBinLoader loader(_filename, data_layout);
@@ -203,8 +207,10 @@ void NumPyAccessor::access_numpy_tensor(ITensor &tensor, T tolerance)
int num_mismatches = utils::compare_tensor<T>(tensor, _npy_tensor, tolerance);
float percentage_mismatches = static_cast<float>(num_mismatches) / num_elements;
- _output_stream << "Results: " << 100.f - (percentage_mismatches * 100) << " % matches with the provided output[" << _filename << "]." << std::endl;
- _output_stream << " " << num_elements - num_mismatches << " out of " << num_elements << " matches with the provided output[" << _filename << "]." << std::endl
+ _output_stream << "Results: " << 100.f - (percentage_mismatches * 100) << " % matches with the provided output["
+ << _filename << "]." << std::endl;
+ _output_stream << " " << num_elements - num_mismatches << " out of " << num_elements
+ << " matches with the provided output[" << _filename << "]." << std::endl
<< std::endl;
}
@@ -213,7 +219,7 @@ bool NumPyAccessor::access_tensor(ITensor &tensor)
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_ERROR_ON(_npy_tensor.info()->dimension(0) != tensor.info()->dimension(0));
- switch(tensor.info()->data_type())
+ switch (tensor.info()->data_type())
{
case DataType::QASYMM8:
access_numpy_tensor<qasymm8_t>(tensor, 0);
@@ -262,7 +268,7 @@ ImageAccessor::ImageAccessor(std::string filename, bool bgr, std::unique_ptr<IPr
bool ImageAccessor::access_tensor(ITensor &tensor)
{
- if(!_already_loaded)
+ if (!_already_loaded)
{
auto image_loader = utils::ImageLoaderFactory::create(_filename);
ARM_COMPUTE_EXIT_ON_MSG(image_loader == nullptr, "Unsupported image type");
@@ -273,27 +279,30 @@ bool ImageAccessor::access_tensor(ITensor &tensor)
// Get permutated shape and permutation parameters
TensorShape permuted_shape = tensor.info()->tensor_shape();
arm_compute::PermutationVector perm;
- if(tensor.info()->data_layout() != DataLayout::NCHW)
+ if (tensor.info()->data_layout() != DataLayout::NCHW)
{
- std::tie(permuted_shape, perm) = compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout());
+ std::tie(permuted_shape, perm) =
+ compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout());
}
#ifdef __arm__
- ARM_COMPUTE_EXIT_ON_MSG_VAR(image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(),
- "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].",
- image_loader->width(), image_loader->height(), permuted_shape.x(), permuted_shape.y());
+ ARM_COMPUTE_EXIT_ON_MSG_VAR(
+ image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(),
+ "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].",
+ image_loader->width(), image_loader->height(), permuted_shape.x(), permuted_shape.y());
#else // __arm__
- ARM_COMPUTE_EXIT_ON_MSG_VAR(image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(),
- "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].",
- image_loader->width(), image_loader->height(),
- static_cast<uint64_t>(permuted_shape.x()), static_cast<uint64_t>(permuted_shape.y()));
+ ARM_COMPUTE_EXIT_ON_MSG_VAR(
+ image_loader->width() != permuted_shape.x() || image_loader->height() != permuted_shape.y(),
+ "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].",
+ image_loader->width(), image_loader->height(), static_cast<uint64_t>(permuted_shape.x()),
+ static_cast<uint64_t>(permuted_shape.y()));
#endif // __arm__
// Fill the tensor with the PPM content (BGR)
image_loader->fill_planar_tensor(tensor, _bgr);
// Preprocess tensor
- if(_preprocessor)
+ if (_preprocessor)
{
_preprocessor->preprocess(tensor);
}
@@ -310,7 +319,12 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string &
unsigned int start,
unsigned int end,
std::ostream &output_stream)
- : _path(std::move(images_path)), _images(), _preprocessor(std::move(preprocessor)), _bgr(bgr), _offset(0), _output_stream(output_stream)
+ : _path(std::move(images_path)),
+ _images(),
+ _preprocessor(std::move(preprocessor)),
+ _bgr(bgr),
+ _offset(0),
+ _output_stream(output_stream)
{
ARM_COMPUTE_EXIT_ON_MSG(start > end, "Invalid validation range!");
@@ -322,10 +336,10 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string &
// Parse image names
unsigned int counter = 0;
- for(std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter)
+ for (std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter)
{
// Add image to process if within range
- if(counter >= start)
+ if (counter >= start)
{
std::stringstream linestream(line);
std::string image_name;
@@ -335,7 +349,7 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string &
}
}
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", image_list.c_str(), e.what());
}
@@ -344,7 +358,7 @@ ValidationInputAccessor::ValidationInputAccessor(const std::string &
bool ValidationInputAccessor::access_tensor(arm_compute::ITensor &tensor)
{
bool ret = _offset < _images.size();
- if(ret)
+ if (ret)
{
utils::JPEGLoader jpeg;
@@ -356,28 +370,30 @@ bool ValidationInputAccessor::access_tensor(arm_compute::ITensor &tensor)
// Get permutated shape and permutation parameters
TensorShape permuted_shape = tensor.info()->tensor_shape();
arm_compute::PermutationVector perm;
- if(tensor.info()->data_layout() != DataLayout::NCHW)
+ if (tensor.info()->data_layout() != DataLayout::NCHW)
{
- std::tie(permuted_shape, perm) = compute_permutation_parameters(tensor.info()->tensor_shape(),
- tensor.info()->data_layout());
+ std::tie(permuted_shape, perm) =
+ compute_permutation_parameters(tensor.info()->tensor_shape(), tensor.info()->data_layout());
}
#ifdef __arm__
ARM_COMPUTE_EXIT_ON_MSG_VAR(jpeg.width() != permuted_shape.x() || jpeg.height() != permuted_shape.y(),
- "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32 ",%" PRIu32 "].",
+ "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu32
+ ",%" PRIu32 "].",
jpeg.width(), jpeg.height(), permuted_shape.x(), permuted_shape.y());
#else // __arm__
ARM_COMPUTE_EXIT_ON_MSG_VAR(jpeg.width() != permuted_shape.x() || jpeg.height() != permuted_shape.y(),
- "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64 ",%" PRIu64 "].",
- jpeg.width(), jpeg.height(),
- static_cast<uint64_t>(permuted_shape.x()), static_cast<uint64_t>(permuted_shape.y()));
+ "Failed to load image file: dimensions [%d,%d] not correct, expected [%" PRIu64
+ ",%" PRIu64 "].",
+ jpeg.width(), jpeg.height(), static_cast<uint64_t>(permuted_shape.x()),
+ static_cast<uint64_t>(permuted_shape.y()));
#endif // __arm__
// Fill the tensor with the JPEG content (BGR)
jpeg.fill_planar_tensor(tensor, _bgr);
// Preprocess tensor
- if(_preprocessor)
+ if (_preprocessor)
{
_preprocessor->preprocess(tensor);
}
@@ -402,10 +418,10 @@ ValidationOutputAccessor::ValidationOutputAccessor(const std::string &image_list
// Parse image correctly classified labels
unsigned int counter = 0;
- for(std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter)
+ for (std::string line; !std::getline(ifs, line).fail() && counter <= end; ++counter)
{
// Add label if within range
- if(counter >= start)
+ if (counter >= start)
{
std::stringstream linestream(line);
std::string image_name;
@@ -416,7 +432,7 @@ ValidationOutputAccessor::ValidationOutputAccessor(const std::string &image_list
}
}
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", image_list.c_str(), e.what());
}
@@ -432,11 +448,11 @@ void ValidationOutputAccessor::reset()
bool ValidationOutputAccessor::access_tensor(arm_compute::ITensor &tensor)
{
bool ret = _offset < _results.size();
- if(ret)
+ if (ret)
{
// Get results
std::vector<size_t> tensor_results;
- switch(tensor.info()->data_type())
+ switch (tensor.info()->data_type())
{
case DataType::QASYMM8:
tensor_results = access_predictions_tensor<uint8_t>(tensor);
@@ -459,7 +475,7 @@ bool ValidationOutputAccessor::access_tensor(arm_compute::ITensor &tensor)
}
// Report top_n accuracy
- if(_offset >= _results.size())
+ if (_offset >= _results.size())
{
report_top_n(1, _results.size(), _positive_samples_top1);
report_top_n(5, _results.size(), _positive_samples_top5);
@@ -481,23 +497,19 @@ std::vector<size_t> ValidationOutputAccessor::access_predictions_tensor(arm_comp
// Sort results
std::iota(std::begin(index), std::end(index), static_cast<size_t>(0));
- std::sort(std::begin(index), std::end(index),
- [&](size_t a, size_t b)
- {
- return output_net[a] > output_net[b];
- });
+ std::sort(std::begin(index), std::end(index), [&](size_t a, size_t b) { return output_net[a] > output_net[b]; });
return index;
}
-void ValidationOutputAccessor::aggregate_sample(const std::vector<size_t> &res, size_t &positive_samples, size_t top_n, size_t correct_label)
+void ValidationOutputAccessor::aggregate_sample(const std::vector<size_t> &res,
+ size_t &positive_samples,
+ size_t top_n,
+ size_t correct_label)
{
- auto is_valid_label = [correct_label](size_t label)
- {
- return label == correct_label;
- };
+ auto is_valid_label = [correct_label](size_t label) { return label == correct_label; };
- if(std::any_of(std::begin(res), std::begin(res) + top_n, is_valid_label))
+ if (std::any_of(std::begin(res), std::begin(res) + top_n, is_valid_label))
{
++positive_samples;
}
@@ -508,14 +520,15 @@ void ValidationOutputAccessor::report_top_n(size_t top_n, size_t total_samples,
size_t negative_samples = total_samples - positive_samples;
float accuracy = positive_samples / static_cast<float>(total_samples);
- _output_stream << "----------Top " << top_n << " accuracy ----------" << std::endl
- << std::endl;
+ _output_stream << "----------Top " << top_n << " accuracy ----------" << std::endl << std::endl;
_output_stream << "Positive samples : " << positive_samples << std::endl;
_output_stream << "Negative samples : " << negative_samples << std::endl;
_output_stream << "Accuracy : " << accuracy << std::endl;
}
-DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path, std::vector<TensorShape> &imgs_tensor_shapes, std::ostream &output_stream)
+DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path,
+ std::vector<TensorShape> &imgs_tensor_shapes,
+ std::ostream &output_stream)
: _labels(), _tensor_shapes(std::move(imgs_tensor_shapes)), _output_stream(output_stream)
{
_labels.clear();
@@ -527,12 +540,12 @@ DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path,
ifs.exceptions(std::ifstream::badbit);
ifs.open(labels_path, std::ios::in | std::ios::binary);
- for(std::string line; !std::getline(ifs, line).fail();)
+ for (std::string line; !std::getline(ifs, line).fail();)
{
_labels.emplace_back(line);
}
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", labels_path.c_str(), e.what());
}
@@ -542,26 +555,24 @@ template <typename T>
void DetectionOutputAccessor::access_predictions_tensor(ITensor &tensor)
{
const size_t num_detection = tensor.info()->valid_region().shape.y();
- const auto output_prt = reinterpret_cast<T *>(tensor.buffer() + tensor.info()->offset_first_element_in_bytes());
+ const auto output_prt = reinterpret_cast<T *>(tensor.buffer() + tensor.info()->offset_first_element_in_bytes());
- if(num_detection > 0)
+ if (num_detection > 0)
{
- _output_stream << "---------------------- Detections ----------------------" << std::endl
- << std::endl;
+ _output_stream << "---------------------- Detections ----------------------" << std::endl << std::endl;
- _output_stream << std::left << std::setprecision(4) << std::setw(8) << "Image | " << std::setw(8) << "Label | " << std::setw(12) << "Confidence | "
+ _output_stream << std::left << std::setprecision(4) << std::setw(8) << "Image | " << std::setw(8) << "Label | "
+ << std::setw(12) << "Confidence | "
<< "[ xmin, ymin, xmax, ymax ]" << std::endl;
- for(size_t i = 0; i < num_detection; ++i)
+ for (size_t i = 0; i < num_detection; ++i)
{
auto im = static_cast<const int>(output_prt[i * 7]);
- _output_stream << std::setw(8) << im << std::setw(8)
- << _labels[output_prt[i * 7 + 1]] << std::setw(12) << output_prt[i * 7 + 2]
- << " [" << (output_prt[i * 7 + 3] * _tensor_shapes[im].x())
- << ", " << (output_prt[i * 7 + 4] * _tensor_shapes[im].y())
- << ", " << (output_prt[i * 7 + 5] * _tensor_shapes[im].x())
- << ", " << (output_prt[i * 7 + 6] * _tensor_shapes[im].y())
- << "]" << std::endl;
+ _output_stream << std::setw(8) << im << std::setw(8) << _labels[output_prt[i * 7 + 1]] << std::setw(12)
+ << output_prt[i * 7 + 2] << " [" << (output_prt[i * 7 + 3] * _tensor_shapes[im].x()) << ", "
+ << (output_prt[i * 7 + 4] * _tensor_shapes[im].y()) << ", "
+ << (output_prt[i * 7 + 5] * _tensor_shapes[im].x()) << ", "
+ << (output_prt[i * 7 + 6] * _tensor_shapes[im].y()) << "]" << std::endl;
}
}
else
@@ -574,7 +585,7 @@ bool DetectionOutputAccessor::access_tensor(ITensor &tensor)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32);
- switch(tensor.info()->data_type())
+ switch (tensor.info()->data_type())
{
case DataType::F32:
access_predictions_tensor<float>(tensor);
@@ -586,7 +597,9 @@ bool DetectionOutputAccessor::access_tensor(ITensor &tensor)
return false;
}
-TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path, size_t top_n, std::ostream &output_stream)
+TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path,
+ size_t top_n,
+ std::ostream &output_stream)
: _labels(), _output_stream(output_stream), _top_n(top_n)
{
_labels.clear();
@@ -598,12 +611,12 @@ TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path,
ifs.exceptions(std::ifstream::badbit);
ifs.open(labels_path, std::ios::in | std::ios::binary);
- for(std::string line; !std::getline(ifs, line).fail();)
+ for (std::string line; !std::getline(ifs, line).fail();)
{
_labels.emplace_back(line);
}
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", labels_path.c_str(), e.what());
}
@@ -627,18 +640,13 @@ void TopNPredictionsAccessor::access_predictions_tensor(ITensor &tensor)
// Sort results
std::iota(std::begin(index), std::end(index), static_cast<size_t>(0));
std::sort(std::begin(index), std::end(index),
- [&](size_t a, size_t b)
- {
- return classes_prob[a] > classes_prob[b];
- });
+ [&](size_t a, size_t b) { return classes_prob[a] > classes_prob[b]; });
- _output_stream << "---------- Top " << _top_n << " predictions ----------" << std::endl
- << std::endl;
- for(size_t i = 0; i < _top_n; ++i)
+ _output_stream << "---------- Top " << _top_n << " predictions ----------" << std::endl << std::endl;
+ for (size_t i = 0; i < _top_n; ++i)
{
- _output_stream << std::fixed << std::setprecision(4)
- << +classes_prob[index.at(i)]
- << " - [id = " << index.at(i) << "]"
+ _output_stream << std::fixed << std::setprecision(4) << +classes_prob[index.at(i)] << " - [id = " << index.at(i)
+ << "]"
<< ", " << _labels[index.at(i)] << std::endl;
}
}
@@ -648,7 +656,7 @@ bool TopNPredictionsAccessor::access_tensor(ITensor &tensor)
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_ERROR_ON(_labels.size() != tensor.info()->dimension(0));
- switch(tensor.info()->data_type())
+ switch (tensor.info()->data_type())
{
case DataType::QASYMM8:
access_predictions_tensor<uint8_t>(tensor);
@@ -673,9 +681,9 @@ void RandomAccessor::fill(ITensor &tensor, D &&distribution)
{
std::mt19937 gen(_seed);
- if(tensor.info()->padding().empty() && (dynamic_cast<SubTensor *>(&tensor) == nullptr))
+ if (tensor.info()->padding().empty() && (dynamic_cast<SubTensor *>(&tensor) == nullptr))
{
- for(size_t offset = 0; offset < tensor.info()->total_size(); offset += tensor.info()->element_size())
+ for (size_t offset = 0; offset < tensor.info()->total_size(); offset += tensor.info()->element_size())
{
const auto value = static_cast<T>(distribution(gen));
*reinterpret_cast<T *>(tensor.buffer() + offset) = value;
@@ -687,17 +695,18 @@ void RandomAccessor::fill(ITensor &tensor, D &&distribution)
Window window;
window.use_tensor_dimensions(tensor.info()->tensor_shape());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto value = static_cast<T>(distribution(gen));
- *reinterpret_cast<T *>(tensor.ptr_to_element(id)) = value;
- });
+ execute_window_loop(window,
+ [&](const Coordinates &id)
+ {
+ const auto value = static_cast<T>(distribution(gen));
+ *reinterpret_cast<T *>(tensor.ptr_to_element(id)) = value;
+ });
}
}
bool RandomAccessor::access_tensor(ITensor &tensor)
{
- switch(tensor.info()->data_type())
+ switch (tensor.info()->data_type())
{
case DataType::QASYMM8:
case DataType::U8:
@@ -750,7 +759,8 @@ bool RandomAccessor::access_tensor(ITensor &tensor)
}
case DataType::F16:
{
- arm_compute::utils::uniform_real_distribution_16bit<half> distribution_f16(_lower.get<float>(), _upper.get<float>());
+ arm_compute::utils::uniform_real_distribution_16bit<half> distribution_f16(_lower.get<float>(),
+ _upper.get<float>());
fill<half>(tensor, distribution_f16);
break;
}
@@ -779,7 +789,7 @@ NumPyBinLoader::NumPyBinLoader(std::string filename, DataLayout file_layout)
bool NumPyBinLoader::access_tensor(ITensor &tensor)
{
- if(!_already_loaded)
+ if (!_already_loaded)
{
utils::NPYLoader loader;
loader.open(_filename, _file_layout);
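
The largest visual change in this file is clang-format's new layout for execute_window_loop lambdas. A minimal sketch of that pattern, assuming an already-allocated F32 tensor; scale_to_unit_range is an illustrative helper, not a GraphUtils function:

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Window.h"

    void scale_to_unit_range(arm_compute::ITensor &tensor)
    {
        arm_compute::Window window;
        window.use_tensor_dimensions(tensor.info()->tensor_shape());

        // Same loop/lambda shape as the preprocessors above, in the post-format layout.
        arm_compute::execute_window_loop(window,
                                         [&](const arm_compute::Coordinates &id)
                                         {
                                             auto *value = reinterpret_cast<float *>(tensor.ptr_to_element(id));
                                             *value      = *value / 255.f; // normalise to [0, 1]
                                         });
    }
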
diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h
index 80055acc0f..b48300bd01 100644
--- a/utils/GraphUtils.h
+++ b/utils/GraphUtils.h
@@ -66,7 +66,7 @@ public:
* @param[in] bgr Boolean specifying if the preprocessing should assume BGR format
* @param[in] scale Scale value
*/
- CaffePreproccessor(std::array<float, 3> mean = std::array<float, 3> { { 0, 0, 0 } }, bool bgr = true, float scale = 1.f);
+ CaffePreproccessor(std::array<float, 3> mean = std::array<float, 3>{{0, 0, 0}}, bool bgr = true, float scale = 1.f);
void preprocess(ITensor &tensor) override;
private:
@@ -74,8 +74,8 @@ private:
void preprocess_typed(ITensor &tensor);
std::array<float, 3> _mean;
- bool _bgr;
- float _scale;
+ bool _bgr;
+ float _scale;
};
/** TF preproccessor */
@@ -155,7 +155,11 @@ public:
* @param[in] data_layout (Optional) DataLayout of the numpy tensor data.
* @param[out] output_stream (Optional) Output stream
*/
- NumPyAccessor(std::string npy_path, TensorShape shape, DataType data_type, DataLayout data_layout = DataLayout::NCHW, std::ostream &output_stream = std::cout);
+ NumPyAccessor(std::string npy_path,
+ TensorShape shape,
+ DataType data_type,
+ DataLayout data_layout = DataLayout::NCHW,
+ std::ostream &output_stream = std::cout);
/** Allow instances of this class to be move constructed */
NumPyAccessor(NumPyAccessor &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -353,7 +357,9 @@ public:
* @param[in] imgs_tensor_shapes Network input images tensor shapes.
* @param[out] output_stream (Optional) Output stream
*/
- DetectionOutputAccessor(const std::string &labels_path, std::vector<TensorShape> &imgs_tensor_shapes, std::ostream &output_stream = std::cout);
+ DetectionOutputAccessor(const std::string &labels_path,
+ std::vector<TensorShape> &imgs_tensor_shapes,
+ std::ostream &output_stream = std::cout);
/** Allow instances of this class to be move constructed */
DetectionOutputAccessor(DetectionOutputAccessor &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -422,7 +428,7 @@ public:
private:
template <typename T, typename D>
- void fill(ITensor &tensor, D &&distribution);
+ void fill(ITensor &tensor, D &&distribution);
PixelValue _lower;
PixelValue _upper;
std::random_device::result_type _seed;
@@ -458,7 +464,8 @@ private:
*
* @return A random accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_random_accessor(PixelValue lower, PixelValue upper, const std::random_device::result_type seed = 0)
+inline std::unique_ptr<graph::ITensorAccessor>
+get_random_accessor(PixelValue lower, PixelValue upper, const std::random_device::result_type seed = 0)
{
return std::make_unique<RandomAccessor>(lower, upper, seed);
}
@@ -473,11 +480,10 @@ inline std::unique_ptr<graph::ITensorAccessor> get_random_accessor(PixelValue lo
*
* @return An appropriate tensor accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_weights_accessor(const std::string &path,
- const std::string &data_file,
- DataLayout file_layout = DataLayout::NCHW)
+inline std::unique_ptr<graph::ITensorAccessor>
+get_weights_accessor(const std::string &path, const std::string &data_file, DataLayout file_layout = DataLayout::NCHW)
{
- if(path.empty())
+ if (path.empty())
{
return std::make_unique<DummyAccessor>();
}
@@ -495,30 +501,28 @@ inline std::unique_ptr<graph::ITensorAccessor> get_weights_accessor(const std::s
*
* @return An appropriate tensor accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_input_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
- std::unique_ptr<IPreprocessor> preprocessor = nullptr,
- bool bgr = true)
+inline std::unique_ptr<graph::ITensorAccessor>
+get_input_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
+ std::unique_ptr<IPreprocessor> preprocessor = nullptr,
+ bool bgr = true)
{
- if(!graph_parameters.validation_file.empty())
+ if (!graph_parameters.validation_file.empty())
{
- return std::make_unique<ValidationInputAccessor>(graph_parameters.validation_file,
- graph_parameters.validation_path,
- std::move(preprocessor),
- bgr,
- graph_parameters.validation_range_start,
- graph_parameters.validation_range_end);
+ return std::make_unique<ValidationInputAccessor>(
+ graph_parameters.validation_file, graph_parameters.validation_path, std::move(preprocessor), bgr,
+ graph_parameters.validation_range_start, graph_parameters.validation_range_end);
}
else
{
const std::string &image_file = graph_parameters.image;
const std::string &image_file_lower = lower_string(image_file);
- if(arm_compute::utility::endswith(image_file_lower, ".npy"))
+ if (arm_compute::utility::endswith(image_file_lower, ".npy"))
{
return std::make_unique<NumPyBinLoader>(image_file, graph_parameters.data_layout);
}
- else if(arm_compute::utility::endswith(image_file_lower, ".jpeg")
- || arm_compute::utility::endswith(image_file_lower, ".jpg")
- || arm_compute::utility::endswith(image_file_lower, ".ppm"))
+ else if (arm_compute::utility::endswith(image_file_lower, ".jpeg") ||
+ arm_compute::utility::endswith(image_file_lower, ".jpg") ||
+ arm_compute::utility::endswith(image_file_lower, ".ppm"))
{
return std::make_unique<ImageAccessor>(image_file, bgr, std::move(preprocessor));
}
@@ -541,20 +545,20 @@ inline std::unique_ptr<graph::ITensorAccessor> get_input_accessor(const arm_comp
*
* @return An appropriate tensor accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
- size_t top_n = 5,
- bool is_validation = false,
- std::ostream &output_stream = std::cout)
+inline std::unique_ptr<graph::ITensorAccessor>
+get_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
+ size_t top_n = 5,
+ bool is_validation = false,
+ std::ostream &output_stream = std::cout)
{
ARM_COMPUTE_UNUSED(is_validation);
- if(!graph_parameters.validation_file.empty())
+ if (!graph_parameters.validation_file.empty())
{
- return std::make_unique<ValidationOutputAccessor>(graph_parameters.validation_file,
- output_stream,
+ return std::make_unique<ValidationOutputAccessor>(graph_parameters.validation_file, output_stream,
graph_parameters.validation_range_start,
graph_parameters.validation_range_end);
}
- else if(graph_parameters.labels.empty())
+ else if (graph_parameters.labels.empty())
{
return std::make_unique<DummyAccessor>(0);
}
@@ -575,20 +579,20 @@ inline std::unique_ptr<graph::ITensorAccessor> get_output_accessor(const arm_com
*
* @return An appropriate tensor accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_detection_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
- std::vector<TensorShape> tensor_shapes,
- bool is_validation = false,
- std::ostream &output_stream = std::cout)
+inline std::unique_ptr<graph::ITensorAccessor>
+get_detection_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
+ std::vector<TensorShape> tensor_shapes,
+ bool is_validation = false,
+ std::ostream &output_stream = std::cout)
{
ARM_COMPUTE_UNUSED(is_validation);
- if(!graph_parameters.validation_file.empty())
+ if (!graph_parameters.validation_file.empty())
{
- return std::make_unique<ValidationOutputAccessor>(graph_parameters.validation_file,
- output_stream,
+ return std::make_unique<ValidationOutputAccessor>(graph_parameters.validation_file, output_stream,
graph_parameters.validation_range_start,
graph_parameters.validation_range_end);
}
- else if(graph_parameters.labels.empty())
+ else if (graph_parameters.labels.empty())
{
return std::make_unique<DummyAccessor>(0);
}
@@ -609,10 +613,13 @@ inline std::unique_ptr<graph::ITensorAccessor> get_detection_output_accessor(con
*
* @return An appropriate tensor accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_npy_output_accessor(const std::string &npy_path, TensorShape shape, DataType data_type, DataLayout data_layout = DataLayout::NCHW,
+inline std::unique_ptr<graph::ITensorAccessor> get_npy_output_accessor(const std::string &npy_path,
+ TensorShape shape,
+ DataType data_type,
+ DataLayout data_layout = DataLayout::NCHW,
std::ostream &output_stream = std::cout)
{
- if(npy_path.empty())
+ if (npy_path.empty())
{
return std::make_unique<DummyAccessor>(0);
}
@@ -631,9 +638,10 @@ inline std::unique_ptr<graph::ITensorAccessor> get_npy_output_accessor(const std
*
* @return An appropriate tensor accessor
*/
-inline std::unique_ptr<graph::ITensorAccessor> get_save_npy_output_accessor(const std::string &npy_name, const bool is_fortran = false)
+inline std::unique_ptr<graph::ITensorAccessor> get_save_npy_output_accessor(const std::string &npy_name,
+ const bool is_fortran = false)
{
- if(npy_name.empty())
+ if (npy_name.empty())
{
return std::make_unique<DummyAccessor>(0);
}
@@ -664,9 +672,11 @@ inline std::unique_ptr<graph::ITensorAccessor> get_print_output_accessor(std::os
*/
inline TensorShape permute_shape(TensorShape tensor_shape, DataLayout in_data_layout, DataLayout out_data_layout)
{
- if(in_data_layout != out_data_layout)
+ if (in_data_layout != out_data_layout)
{
- arm_compute::PermutationVector perm_vec = (in_data_layout == DataLayout::NCHW) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U);
+ arm_compute::PermutationVector perm_vec = (in_data_layout == DataLayout::NCHW)
+ ? arm_compute::PermutationVector(2U, 0U, 1U)
+ : arm_compute::PermutationVector(1U, 2U, 0U);
arm_compute::permute(tensor_shape, perm_vec);
}
return tensor_shape;
@@ -681,7 +691,7 @@ inline TensorShape permute_shape(TensorShape tensor_shape, DataLayout in_data_la
inline graph::Target set_target_hint(int target)
{
ARM_COMPUTE_ERROR_ON_MSG(target > 2, "Invalid target. Target must be 0 (NEON), 1 (OpenCL), 2 (OpenCL + Tuner)");
- if((target == 1 || target == 2))
+ if ((target == 1 || target == 2))
{
return graph::Target::CL;
}
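
A hedged sketch of the accessor helpers reformatted above; which concrete accessor comes back follows the if/else chains visible in this header, while make_io_accessors itself is only an illustrative wrapper:

    #include "utils/CommonGraphOptions.h"
    #include "utils/GraphUtils.h"

    void make_io_accessors(const arm_compute::utils::CommonGraphParams &params)
    {
        // Validation file set -> ValidationInputAccessor; .npy image -> NumPyBinLoader;
        // .jpeg/.jpg/.ppm -> ImageAccessor; otherwise a DummyAccessor.
        auto input = arm_compute::graph_utils::get_input_accessor(params);

        // Validation file set -> ValidationOutputAccessor; labels set -> top-n predictions;
        // otherwise a DummyAccessor(0).
        auto output = arm_compute::graph_utils::get_output_accessor(params, 5 /* top_n */);

        (void)input;
        (void)output;
    }
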
diff --git a/utils/ImageLoader.h b/utils/ImageLoader.h
index aab0f5e770..2ae1a416e2 100644
--- a/utils/ImageLoader.h
+++ b/utils/ImageLoader.h
@@ -68,8 +68,7 @@ public:
*
* @param[in] fs Image file stream
*/
- FileImageFeeder(std::ifstream &fs)
- : _fs(fs)
+ FileImageFeeder(std::ifstream &fs) : _fs(fs)
{
}
// Inherited overridden methods
@@ -94,8 +93,7 @@ public:
*
* @param[in] data Pointer to data
*/
- MemoryImageFeeder(const uint8_t *data)
- : _data(data)
+ MemoryImageFeeder(const uint8_t *data) : _data(data)
{
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -127,8 +125,7 @@ class IImageLoader
{
public:
/** Default Constructor */
- IImageLoader()
- : _feeder(nullptr), _width(0), _height(0)
+ IImageLoader() : _feeder(nullptr), _width(0), _height(0)
{
}
/** Virtual base destructor */
@@ -188,7 +185,7 @@ public:
// Validate feeding data
validate_info(image.info());
- switch(image.info()->format())
+ switch (image.info()->format())
{
case Format::U8:
{
@@ -204,15 +201,17 @@ public:
unsigned char green = 0;
unsigned char blue = 0;
- execute_window_loop(window, [&](const Coordinates &)
- {
- red = _feeder->get();
- green = _feeder->get();
- blue = _feeder->get();
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ red = _feeder->get();
+ green = _feeder->get();
+ blue = _feeder->get();
- *out.ptr() = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
- },
- out);
+ *out.ptr() = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
+ },
+ out);
break;
}
@@ -226,11 +225,8 @@ public:
Iterator out(&image, window);
size_t row_size = _width * image.info()->element_size();
- execute_window_loop(window, [&](const Coordinates &)
- {
- _feeder->get_row(out.ptr(), row_size);
- },
- out);
+ execute_window_loop(
+ window, [&](const Coordinates &) { _feeder->get_row(out.ptr(), row_size); }, out);
break;
}
@@ -241,7 +237,7 @@ public:
// Unmap buffer if creating a CLTensor
unmap(image);
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Loading image file: %s", e.what());
}
@@ -257,15 +253,19 @@ public:
void fill_planar_tensor(T &tensor, bool bgr = false)
{
ARM_COMPUTE_ERROR_ON(!is_open());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::U8, DataType::QASYMM8, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::U8, DataType::QASYMM8, DataType::F32,
+ DataType::F16);
const DataLayout data_layout = tensor.info()->data_layout();
const TensorShape tensor_shape = tensor.info()->tensor_shape();
ARM_COMPUTE_UNUSED(tensor_shape);
- ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)] != _width);
- ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)] != _height);
- ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)] != 3);
+ ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)] !=
+ _width);
+ ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)] !=
+ _height);
+ ARM_COMPUTE_ERROR_ON(tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)] !=
+ 3);
ARM_COMPUTE_ERROR_ON(_feeder.get() == nullptr);
@@ -282,7 +282,7 @@ public:
// Iterate through every pixel of the image
Window window;
- if(data_layout == DataLayout::NCHW)
+ if (data_layout == DataLayout::NCHW)
{
window.set(Window::DimX, Window::Dimension(0, _width, 1));
window.set(Window::DimY, Window::Dimension(0, _height, 1));
@@ -303,48 +303,50 @@ public:
unsigned char green = 0;
unsigned char blue = 0;
- execute_window_loop(window, [&](const Coordinates &)
- {
- red = _feeder->get();
- green = _feeder->get();
- blue = _feeder->get();
-
- switch(tensor.info()->data_type())
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- case DataType::U8:
- case DataType::QASYMM8:
- {
- *(out.ptr() + 0 * stride_z) = bgr ? blue : red;
- *(out.ptr() + 1 * stride_z) = green;
- *(out.ptr() + 2 * stride_z) = bgr ? red : blue;
- break;
- }
- case DataType::F32:
- {
- *reinterpret_cast<float *>(out.ptr() + 0 * stride_z) = static_cast<float>(bgr ? blue : red);
- *reinterpret_cast<float *>(out.ptr() + 1 * stride_z) = static_cast<float>(green);
- *reinterpret_cast<float *>(out.ptr() + 2 * stride_z) = static_cast<float>(bgr ? red : blue);
- break;
- }
- case DataType::F16:
- {
- *reinterpret_cast<half *>(out.ptr() + 0 * stride_z) = static_cast<half>(bgr ? blue : red);
- *reinterpret_cast<half *>(out.ptr() + 1 * stride_z) = static_cast<half>(green);
- *reinterpret_cast<half *>(out.ptr() + 2 * stride_z) = static_cast<half>(bgr ? red : blue);
- break;
- }
- default:
+ red = _feeder->get();
+ green = _feeder->get();
+ blue = _feeder->get();
+
+ switch (tensor.info()->data_type())
{
- ARM_COMPUTE_ERROR("Unsupported data type");
+ case DataType::U8:
+ case DataType::QASYMM8:
+ {
+ *(out.ptr() + 0 * stride_z) = bgr ? blue : red;
+ *(out.ptr() + 1 * stride_z) = green;
+ *(out.ptr() + 2 * stride_z) = bgr ? red : blue;
+ break;
+ }
+ case DataType::F32:
+ {
+ *reinterpret_cast<float *>(out.ptr() + 0 * stride_z) = static_cast<float>(bgr ? blue : red);
+ *reinterpret_cast<float *>(out.ptr() + 1 * stride_z) = static_cast<float>(green);
+ *reinterpret_cast<float *>(out.ptr() + 2 * stride_z) = static_cast<float>(bgr ? red : blue);
+ break;
+ }
+ case DataType::F16:
+ {
+ *reinterpret_cast<half *>(out.ptr() + 0 * stride_z) = static_cast<half>(bgr ? blue : red);
+ *reinterpret_cast<half *>(out.ptr() + 1 * stride_z) = static_cast<half>(green);
+ *reinterpret_cast<half *>(out.ptr() + 2 * stride_z) = static_cast<half>(bgr ? red : blue);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
}
- }
- },
- out);
+ },
+ out);
// Unmap buffer if creating a CLTensor
unmap(tensor);
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Loading image file: %s", e.what());
}
@@ -368,8 +370,7 @@ class PPMLoader : public IImageLoader
{
public:
/** Default Constructor */
- PPMLoader()
- : IImageLoader(), _fs()
+ PPMLoader() : IImageLoader(), _fs()
{
}
@@ -386,7 +387,7 @@ public:
_fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
_fs.open(filename, std::ios::in | std::ios::binary);
- unsigned int max_val = 0;
+ unsigned int max_val = 0;
std::tie(_width, _height, max_val) = parse_ppm_header(_fs);
ARM_COMPUTE_ERROR_ON_MSG_VAR(max_val >= 256, "2 bytes per colour channel not supported in file %s",
@@ -394,14 +395,14 @@ public:
_feeder = std::make_unique<FileImageFeeder>(_fs);
}
- catch(std::runtime_error &e)
+ catch (std::runtime_error &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what());
}
}
void close() override
{
- if(is_open())
+ if (is_open())
{
_fs.close();
_feeder = nullptr;
@@ -443,8 +444,7 @@ private:
public:
/** Default Constructor */
- JPEGLoader()
- : IImageLoader(), _is_loaded(false), _data(nullptr)
+ JPEGLoader() : IImageLoader(), _is_loaded(false), _data(nullptr)
{
}
@@ -457,7 +457,7 @@ public:
{
int bpp, width, height;
uint8_t *rgb_image = stbi_load(filename.c_str(), &width, &height, &bpp, 3);
- if(rgb_image == NULL)
+ if (rgb_image == NULL)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s failed", filename.c_str());
}
@@ -472,7 +472,7 @@ public:
}
void close() override
{
- if(is_open())
+ if (is_open())
{
_width = 0;
_height = 0;
@@ -483,7 +483,7 @@ public:
/** Explicitly Releases the memory of the loaded data */
void release()
{
- if(_is_loaded)
+ if (_is_loaded)
{
_data.reset();
_is_loaded = false;
@@ -492,7 +492,7 @@ public:
}
private:
- bool _is_loaded;
+ bool _is_loaded;
std::unique_ptr<uint8_t, malloc_deleter> _data;
};
@@ -509,7 +509,7 @@ public:
static std::unique_ptr<IImageLoader> create(const std::string &filename)
{
ImageType type = arm_compute::utils::get_image_type_from_file(filename);
- switch(type)
+ switch (type)
{
case ImageType::PPM:
return std::make_unique<PPMLoader>();
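
A hedged sketch of the loader interface whose formatting changes above; fill_rgb_tensor is an illustrative wrapper, while the factory and loader calls (create, open, is_open, fill_planar_tensor) are the ones visible in this header:

    #include <string>

    #include "arm_compute/runtime/Tensor.h"
    #include "utils/ImageLoader.h"

    void fill_rgb_tensor(const std::string &filename, arm_compute::Tensor &tensor, bool bgr)
    {
        // Picks PPMLoader or JPEGLoader based on the file type.
        auto loader = arm_compute::utils::ImageLoaderFactory::create(filename);
        if (loader != nullptr)
        {
            loader->open(filename);
            if (loader->is_open())
            {
                // Writes the three colour planes (B,G,R order when bgr == true).
                loader->fill_planar_tensor(tensor, bgr);
            }
        }
    }
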
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 69cc3d4fc0..4f14d985af 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -51,11 +51,13 @@
#include "arm_compute/function_info/MatMulInfo.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
#include "arm_compute/runtime/CL/CLTypes.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
+
#include "support/Cast.h"
#include "support/StringSupport.h"
+
#include <ostream>
#include <sstream>
#include <string>
@@ -71,7 +73,7 @@ namespace arm_compute
template <typename T>
std::string to_string_if_not_null(T *arg)
{
- if(arg == nullptr)
+ if (arg == nullptr)
{
return "nullptr";
}
@@ -111,13 +113,13 @@ template <typename T>
os << "[";
bool first = true;
size_t i;
- for(i = 0; i < args.size(); ++i)
+ for (i = 0; i < args.size(); ++i)
{
- if(i == max_print_size)
+ if (i == max_print_size)
{
break;
}
- if(first)
+ if (first)
{
first = false;
}
@@ -127,7 +129,7 @@ template <typename T>
}
os << to_string(args[i]);
}
- if(i < args.size())
+ if (i < args.size())
{
os << ", ...";
}
@@ -159,11 +161,11 @@ std::string to_string(const std::vector<T> &args)
template <typename T>
inline ::std::ostream &operator<<(::std::ostream &os, const Dimensions<T> &dimensions)
{
- if(dimensions.num_dimensions() > 0)
+ if (dimensions.num_dimensions() > 0)
{
os << dimensions[0];
- for(unsigned int d = 1; d < dimensions.num_dimensions(); ++d)
+ for (unsigned int d = 1; d < dimensions.num_dimensions(); ++d)
{
os << "," << dimensions[d];
}
@@ -181,7 +183,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Dimensions<T> &dimen
*/
inline ::std::ostream &operator<<(::std::ostream &os, const RoundingPolicy &rounding_policy)
{
- switch(rounding_policy)
+ switch (rounding_policy)
{
case RoundingPolicy::TO_ZERO:
os << "TO_ZERO";
@@ -209,7 +211,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const RoundingPolicy &roun
inline ::std::ostream &operator<<(::std::ostream &os, const WeightsInfo &weights_info)
{
os << weights_info.are_reshaped() << ";";
- os << weights_info.num_kernels() << ";" << weights_info.kernel_size().first << "," << weights_info.kernel_size().second;
+ os << weights_info.num_kernels() << ";" << weights_info.kernel_size().first << ","
+ << weights_info.kernel_size().second;
return os;
}
@@ -273,7 +276,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMKernelInfo &gemm
*/
inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLHSMatrixInfo &gemm_info)
{
- os << "( m0=" << (unsigned int)gemm_info.m0 << " k0=" << gemm_info.k0 << " v0=" << gemm_info.v0 << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << "})";
+ os << "( m0=" << (unsigned int)gemm_info.m0 << " k0=" << gemm_info.k0 << " v0=" << gemm_info.v0
+ << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << "})";
return os;
}
@@ -286,8 +290,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLHSMatrixInfo &g
*/
inline ::std::ostream &operator<<(::std::ostream &os, const GEMMRHSMatrixInfo &gemm_info)
{
- os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << " h0=" << gemm_info.h0 << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave << " exp_img=" <<
- gemm_info.export_to_cl_image << "})";
+ os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << " h0=" << gemm_info.h0
+ << " trans=" << gemm_info.transpose << " inter=" << gemm_info.interleave
+ << " exp_img=" << gemm_info.export_to_cl_image << "})";
return os;
}
@@ -340,8 +345,8 @@ inline std::string to_string(const GEMMKernelInfo &gemm_info)
inline ::std::ostream &operator<<(::std::ostream &os, const BoundingBoxTransformInfo &bbox_info)
{
auto weights = bbox_info.weights();
- os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] <<
- "})";
+ os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={"
+ << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] << "})";
return os;
}
@@ -454,7 +459,7 @@ inline std::string to_string(const QuantizationInfo &quantization_info)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo::ActivationFunction &act_function)
{
- switch(act_function)
+ switch (act_function)
{
case ActivationLayerInfo::ActivationFunction::ABS:
os << "ABS";
@@ -521,7 +526,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo:
inline std::string to_string(const arm_compute::ActivationLayerInfo &info)
{
std::stringstream str;
- if(info.enabled())
+ if (info.enabled())
{
str << info.activation();
}
@@ -537,9 +542,9 @@ inline std::string to_string(const arm_compute::ActivationLayerInfo &info)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo *info)
{
- if(info != nullptr)
+ if (info != nullptr)
{
- if(info->enabled())
+ if (info->enabled())
{
os << info->activation();
os << "(";
@@ -581,7 +586,7 @@ inline std::string to_string(const arm_compute::ActivationLayerInfo::ActivationF
*/
inline ::std::ostream &operator<<(::std::ostream &os, const NormType &norm_type)
{
- switch(norm_type)
+ switch (norm_type)
{
case NormType::CROSS_MAP:
os << "CROSS_MAP";
@@ -634,7 +639,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NormalizationLayerIn
*/
inline ::std::ostream &operator<<(::std::ostream &os, const PoolingType &pool_type)
{
- switch(pool_type)
+ switch (pool_type)
{
case PoolingType::AVG:
os << "AVG";
@@ -689,7 +694,7 @@ inline std::string to_string(const RoundingPolicy &rounding_policy)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const DataLayout &data_layout)
{
- switch(data_layout)
+ switch (data_layout)
{
case DataLayout::UNKNOWN:
os << "UNKNOWN";
@@ -736,7 +741,7 @@ inline std::string to_string(const arm_compute::DataLayout &data_layout)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const DataLayoutDimension &data_layout_dim)
{
- switch(data_layout_dim)
+ switch (data_layout_dim)
{
case DataLayoutDimension::WIDTH:
os << "WIDTH";
@@ -768,7 +773,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DataLayoutDimension
*/
inline ::std::ostream &operator<<(::std::ostream &os, const DataType &data_type)
{
- switch(data_type)
+ switch (data_type)
{
case DataType::UNKNOWN:
os << "UNKNOWN";
@@ -859,7 +864,7 @@ inline std::string to_string(const arm_compute::DataType &data_type)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const Format &format)
{
- switch(format)
+ switch (format)
{
case Format::UNKNOWN:
os << "UNKNOWN";
@@ -941,7 +946,7 @@ inline std::string to_string(const Format &format)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const Channel &channel)
{
- switch(channel)
+ switch (channel)
{
case Channel::UNKNOWN:
os << "UNKNOWN";
@@ -1008,7 +1013,7 @@ inline std::string to_string(const Channel &channel)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const BorderMode &mode)
{
- switch(mode)
+ switch (mode)
{
case BorderMode::UNDEFINED:
os << "UNDEFINED";
@@ -1035,10 +1040,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const BorderMode &mode)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const BorderSize &border)
{
- os << border.top << ","
- << border.right << ","
- << border.bottom << ","
- << border.left;
+ os << border.top << "," << border.right << "," << border.bottom << "," << border.left;
return os;
}
@@ -1053,7 +1055,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const BorderSize &border)
inline ::std::ostream &operator<<(::std::ostream &os, const PaddingList &padding)
{
os << "{";
- for(auto const &p : padding)
+ for (auto const &p : padding)
{
os << "{" << p.first << "," << p.second << "}";
}
@@ -1071,7 +1073,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PaddingList &padding
inline ::std::ostream &operator<<(::std::ostream &os, const Multiples &multiples)
{
os << "(";
- for(size_t i = 0; i < multiples.size() - 1; i++)
+ for (size_t i = 0; i < multiples.size() - 1; i++)
{
os << multiples[i] << ", ";
}
@@ -1088,7 +1090,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Multiples &multiples
*/
inline ::std::ostream &operator<<(::std::ostream &os, const InterpolationPolicy &policy)
{
- switch(policy)
+ switch (policy)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
os << "NEAREST_NEIGHBOR";
@@ -1115,7 +1117,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const InterpolationPolicy
*/
inline ::std::ostream &operator<<(::std::ostream &os, const SamplingPolicy &policy)
{
- switch(policy)
+ switch (policy)
{
case SamplingPolicy::CENTER:
os << "CENTER";
@@ -1146,18 +1148,16 @@ inline ::std::ostream &operator<<(std::ostream &os, const ITensorInfo *info)
<< "DataLayout=" << string_from_data_layout(data_layout) << ","
<< "DataType=" << string_from_data_type(data_type);
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const QuantizationInfo qinfo = info->quantization_info();
const auto scales = qinfo.scale();
const auto offsets = qinfo.offset();
os << ", QuantizationInfo={"
- << "scales.size=" << scales.size()
- << ", scale(s)=" << scales << ", ";
+ << "scales.size=" << scales.size() << ", scale(s)=" << scales << ", ";
- os << "offsets.size=" << offsets.size()
- << ", offset(s)=" << offsets << "}";
+ os << "offsets.size=" << offsets.size() << ", offset(s)=" << offsets << "}";
}
return os;
}
@@ -1210,7 +1210,7 @@ inline std::string to_string(const ITensorInfo &info)
inline std::string to_string(const ITensorInfo *info)
{
std::string ret_str = "nullptr";
- if(info != nullptr)
+ if (info != nullptr)
{
std::stringstream str;
str << info;
@@ -1239,7 +1239,7 @@ inline std::string to_string(ITensorInfo *info)
inline std::string to_string(const ITensor *tensor)
{
std::string ret_str = "nullptr";
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
std::stringstream str;
str << "ITensor->info(): " << tensor->info();
@@ -1282,7 +1282,7 @@ inline std::string to_string(ITensor &tensor)
inline std::string to_string(const ICLTensor *cl_tensor)
{
std::string ret_str = "nullptr";
- if(cl_tensor != nullptr)
+ if (cl_tensor != nullptr)
{
std::stringstream str;
str << "ICLTensor->info(): " << cl_tensor->info();
@@ -1311,11 +1311,7 @@ inline std::string to_string(ICLTensor *cl_tensor)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const cl::NDRange &nd_range)
{
- os << "{"
- << nd_range[0] << ","
- << nd_range[1] << ","
- << nd_range[2]
- << "}";
+ os << "{" << nd_range[0] << "," << nd_range[1] << "," << nd_range[2] << "}";
return os;
}
@@ -1451,9 +1447,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Window::Dimension &d
inline ::std::ostream &operator<<(::std::ostream &os, const Window &win)
{
os << "{";
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; i++)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; i++)
{
- if(i > 0)
+ if (i > 0)
{
os << ", ";
}
@@ -1537,7 +1533,7 @@ inline std::string to_string(const Window &win)
inline std::string to_string(Window *win)
{
std::string ret_str = "nullptr";
- if(win != nullptr)
+ if (win != nullptr)
{
std::stringstream str;
str << *win;
@@ -1570,7 +1566,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Rectangle &rect)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const PaddingMode &mode)
{
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
os << "CONSTANT";
@@ -1612,8 +1608,8 @@ inline ::std::ostream &operator<<(::std::ostream &os, const PadStrideInfo &pad_s
{
os << pad_stride_info.stride().first << "," << pad_stride_info.stride().second;
os << ";";
- os << pad_stride_info.pad_left() << "," << pad_stride_info.pad_right() << ","
- << pad_stride_info.pad_top() << "," << pad_stride_info.pad_bottom();
+ os << pad_stride_info.pad_left() << "," << pad_stride_info.pad_right() << "," << pad_stride_info.pad_top() << ","
+ << pad_stride_info.pad_bottom();
return os;
}
@@ -1718,7 +1714,7 @@ inline std::string to_string(const SamplingPolicy &policy)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ConvertPolicy &policy)
{
- switch(policy)
+ switch (policy)
{
case ConvertPolicy::WRAP:
os << "WRAP";
@@ -1749,7 +1745,7 @@ inline std::string to_string(const ConvertPolicy &policy)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ArithmeticOperation &op)
{
- switch(op)
+ switch (op)
{
case ArithmeticOperation::ADD:
os << "ADD";
@@ -1804,7 +1800,7 @@ inline std::string to_string(const ArithmeticOperation &op)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ReductionOperation &op)
{
- switch(op)
+ switch (op)
{
case ReductionOperation::SUM:
os << "SUM";
@@ -1859,7 +1855,7 @@ inline std::string to_string(const ReductionOperation &op)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ComparisonOperation &op)
{
- switch(op)
+ switch (op)
{
case ComparisonOperation::Equal:
os << "Equal";
@@ -1895,7 +1891,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ComparisonOperation
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ElementWiseUnary &op)
{
- switch(op)
+ switch (op)
{
case ElementWiseUnary::RSQRT:
os << "RSQRT";
@@ -1992,7 +1988,7 @@ inline std::string to_string(const PoolingLayerInfo &info)
str << "{Type=" << info.pool_type << ","
<< "DataLayout=" << info.data_layout << ","
<< "IsGlobalPooling=" << info.is_global_pooling;
- if(!info.is_global_pooling)
+ if (!info.is_global_pooling)
{
str << ","
<< "PoolSize=" << info.pool_size.width << "," << info.pool_size.height << ","
@@ -2038,8 +2034,7 @@ inline std::string to_string(const Size3D &type)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const Padding3D &padding3d)
{
- os << padding3d.left << "," << padding3d.right << ","
- << padding3d.top << "," << padding3d.bottom << ","
+ os << padding3d.left << "," << padding3d.right << "," << padding3d.top << "," << padding3d.bottom << ","
<< padding3d.front << "," << padding3d.back;
return os;
}
@@ -2066,7 +2061,7 @@ inline std::string to_string(const Padding3D &padding3d)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const DimensionRoundingType &rounding_type)
{
- switch(rounding_type)
+ switch (rounding_type)
{
case DimensionRoundingType::CEIL:
os << "CEIL";
@@ -2091,7 +2086,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Pooling3dLayerInfo &
{
os << "{Type=" << info.pool_type << ","
<< "IsGlobalPooling=" << info.is_global_pooling;
- if(!info.is_global_pooling)
+ if (!info.is_global_pooling)
{
os << ","
<< "PoolSize=" << info.pool_size << ", "
@@ -2128,16 +2123,10 @@ inline std::string to_string(const PriorBoxLayerInfo &info)
{
std::stringstream str;
str << "{";
- str << "Clip:" << info.clip()
- << "Flip:" << info.flip()
- << "StepX:" << info.steps()[0]
- << "StepY:" << info.steps()[1]
- << "MinSizes:" << info.min_sizes().size()
- << "MaxSizes:" << info.max_sizes().size()
- << "ImgSizeX:" << info.img_size().x
- << "ImgSizeY:" << info.img_size().y
- << "Offset:" << info.offset()
- << "Variances:" << info.variances().size();
+ str << "Clip:" << info.clip() << "Flip:" << info.flip() << "StepX:" << info.steps()[0]
+ << "StepY:" << info.steps()[1] << "MinSizes:" << info.min_sizes().size()
+ << "MaxSizes:" << info.max_sizes().size() << "ImgSizeX:" << info.img_size().x
+ << "ImgSizeY:" << info.img_size().y << "Offset:" << info.offset() << "Variances:" << info.variances().size();
str << "}";
return str.str();
}
@@ -2178,7 +2167,7 @@ inline std::string to_string(const Size2D &type)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &conv_method)
{
- switch(conv_method)
+ switch (conv_method)
{
case ConvolutionMethod::GEMM:
os << "GEMM";
@@ -2224,7 +2213,7 @@ inline std::string to_string(const ConvolutionMethod &conv_method)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const GPUTarget &gpu_target)
{
- switch(gpu_target)
+ switch (gpu_target)
{
case GPUTarget::GPU_ARCH_MASK:
os << "GPU_ARCH_MASK";
@@ -2358,7 +2347,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionWindow &det
*/
inline ::std::ostream &operator<<(::std::ostream &os, const DetectionOutputLayerCodeType &detection_code)
{
- switch(detection_code)
+ switch (detection_code)
{
case DetectionOutputLayerCodeType::CENTER_SIZE:
os << "CENTER_SIZE";
@@ -2410,8 +2399,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionOutputLayer
<< "BackgroundLabelId=" << detection_info.background_label_id() << ","
<< "ConfidenceThreshold=" << detection_info.confidence_threshold() << ","
<< "TopK=" << detection_info.top_k() << ","
- << "NumLocClasses=" << detection_info.num_loc_classes()
- << "}";
+ << "NumLocClasses=" << detection_info.num_loc_classes() << "}";
return os;
}
@@ -2447,8 +2435,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DetectionPostProcess
<< "ScaleValue_h=" << detection_info.scale_value_h() << ","
<< "ScaleValue_w=" << detection_info.scale_value_w() << ","
<< "UseRegularNms=" << detection_info.use_regular_nms() << ","
- << "DetectionPerClass=" << detection_info.detection_per_class()
- << "}";
+ << "DetectionPerClass=" << detection_info.detection_per_class() << "}";
return os;
}
@@ -2488,16 +2475,9 @@ inline std::string to_string(const DetectionWindow &detection_window)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const PriorBoxLayerInfo &info)
{
- os << "Clip:" << info.clip()
- << "Flip:" << info.flip()
- << "StepX:" << info.steps()[0]
- << "StepY:" << info.steps()[1]
- << "MinSizes:" << info.min_sizes()
- << "MaxSizes:" << info.max_sizes()
- << "ImgSizeX:" << info.img_size().x
- << "ImgSizeY:" << info.img_size().y
- << "Offset:" << info.offset()
- << "Variances:" << info.variances();
+ os << "Clip:" << info.clip() << "Flip:" << info.flip() << "StepX:" << info.steps()[0] << "StepY:" << info.steps()[1]
+ << "MinSizes:" << info.min_sizes() << "MaxSizes:" << info.max_sizes() << "ImgSizeX:" << info.img_size().x
+ << "ImgSizeY:" << info.img_size().y << "Offset:" << info.offset() << "Variances:" << info.variances();
return os;
}
@@ -2528,7 +2508,7 @@ inline std::string to_string(const WinogradInfo &type)
*/
inline std::string to_string(const CLTunerMode val)
{
- switch(val)
+ switch (val)
{
case CLTunerMode::EXHAUSTIVE:
{
@@ -2557,7 +2537,7 @@ inline std::string to_string(const CLTunerMode val)
*/
inline std::string to_string(CLGEMMKernelType val)
{
- switch(val)
+ switch (val)
{
case CLGEMMKernelType::NATIVE:
{
@@ -2660,7 +2640,7 @@ inline std::string to_string(const FullyConnectedLayerInfo &info)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const GEMMLowpOutputStageType &gemm_type)
{
- switch(gemm_type)
+ switch (gemm_type)
{
case GEMMLowpOutputStageType::NONE:
os << "NONE";
@@ -2827,7 +2807,7 @@ inline std::string to_string(const ScaleKernelInfo &scale_info)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const FFTDirection &fft_dir)
{
- switch(fft_dir)
+ switch (fft_dir)
{
case FFTDirection::Forward:
os << "Forward";
@@ -2945,7 +2925,7 @@ inline std::string to_string(const Coordinates2D &coord_2d)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const FuseBatchNormalizationType &fuse_type)
{
- switch(fuse_type)
+ switch (fuse_type)
{
case FuseBatchNormalizationType::CONVOLUTION:
os << "CONVOLUTION";
@@ -3073,7 +3053,7 @@ inline std::string to_string(const uint8_t num)
*/
inline ::std::ostream &operator<<(::std::ostream &os, const NMSType &nms_type)
{
- switch(nms_type)
+ switch (nms_type)
{
case NMSType::LINEAR:
os << "LINEAR";
@@ -3196,46 +3176,46 @@ inline std::string to_string(const Conv3dInfo &conv3d_info)
inline std::string to_string(const WeightFormat wf)
{
#define __CASE_WEIGHT_FORMAT(wf) \
-case WeightFormat::wf: \
- return #wf;
- switch(wf)
+ case WeightFormat::wf: \
+ return #wf;
+ switch (wf)
{
- __CASE_WEIGHT_FORMAT(UNSPECIFIED)
- __CASE_WEIGHT_FORMAT(ANY)
- __CASE_WEIGHT_FORMAT(OHWI)
- __CASE_WEIGHT_FORMAT(OHWIo2)
- __CASE_WEIGHT_FORMAT(OHWIo4)
- __CASE_WEIGHT_FORMAT(OHWIo8)
- __CASE_WEIGHT_FORMAT(OHWIo16)
- __CASE_WEIGHT_FORMAT(OHWIo32)
- __CASE_WEIGHT_FORMAT(OHWIo64)
- __CASE_WEIGHT_FORMAT(OHWIo128)
- __CASE_WEIGHT_FORMAT(OHWIo4i2)
- __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo8i2)
- __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo16i2)
- __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo32i2)
- __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo64i2)
- __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo4i4)
- __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo8i4)
- __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo16i4)
- __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo32i4)
- __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo64i4)
- __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16)
- __CASE_WEIGHT_FORMAT(OHWIo2i8)
- __CASE_WEIGHT_FORMAT(OHWIo4i8)
- __CASE_WEIGHT_FORMAT(OHWIo8i8)
- __CASE_WEIGHT_FORMAT(OHWIo16i8)
- __CASE_WEIGHT_FORMAT(OHWIo32i8)
- __CASE_WEIGHT_FORMAT(OHWIo64i8)
+ __CASE_WEIGHT_FORMAT(UNSPECIFIED)
+ __CASE_WEIGHT_FORMAT(ANY)
+ __CASE_WEIGHT_FORMAT(OHWI)
+ __CASE_WEIGHT_FORMAT(OHWIo2)
+ __CASE_WEIGHT_FORMAT(OHWIo4)
+ __CASE_WEIGHT_FORMAT(OHWIo8)
+ __CASE_WEIGHT_FORMAT(OHWIo16)
+ __CASE_WEIGHT_FORMAT(OHWIo32)
+ __CASE_WEIGHT_FORMAT(OHWIo64)
+ __CASE_WEIGHT_FORMAT(OHWIo128)
+ __CASE_WEIGHT_FORMAT(OHWIo4i2)
+ __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo8i2)
+ __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo16i2)
+ __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo32i2)
+ __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo64i2)
+ __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo4i4)
+ __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo8i4)
+ __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo16i4)
+ __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo32i4)
+ __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo64i4)
+ __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16)
+ __CASE_WEIGHT_FORMAT(OHWIo2i8)
+ __CASE_WEIGHT_FORMAT(OHWIo4i8)
+ __CASE_WEIGHT_FORMAT(OHWIo8i8)
+ __CASE_WEIGHT_FORMAT(OHWIo16i8)
+ __CASE_WEIGHT_FORMAT(OHWIo32i8)
+ __CASE_WEIGHT_FORMAT(OHWIo64i8)
default:
return "invalid value";
}
@@ -3282,8 +3262,7 @@ inline std::string to_string(const std::tuple<TensorShape, TensorShape, arm_comp
*/
inline ::std::ostream &operator<<(::std::ostream &os, const Padding2D &padding2d)
{
- os << padding2d.left << "," << padding2d.right << ","
- << padding2d.top << "," << padding2d.bottom;
+ os << padding2d.left << "," << padding2d.right << "," << padding2d.top << "," << padding2d.bottom;
return os;
}
@@ -3426,7 +3405,8 @@ inline std::string to_string(const experimental::dynamic_fusion::CastAttributes
*
* @return Modified output stream.
*/
-inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynamic_fusion::DepthwiseConv2dAttributes &dw_conv2d_attr)
+inline ::std::ostream &operator<<(::std::ostream &os,
+ const experimental::dynamic_fusion::DepthwiseConv2dAttributes &dw_conv2d_attr)
{
os << "DepthwiseConv2dAttributes="
<< "["
@@ -3518,7 +3498,8 @@ inline std::string to_string(const experimental::dynamic_fusion::ResizeAttribute
*
* @return Modified output stream.
*/
-inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynamic_fusion::SoftmaxAttributes &softmax_attr)
+inline ::std::ostream &operator<<(::std::ostream &os,
+ const experimental::dynamic_fusion::SoftmaxAttributes &softmax_attr)
{
os << "SoftmaxAttributes="
<< "["
@@ -3583,8 +3564,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const arm_compute::MatMulK
<< "M0=" << matmul_info.m0 << ", "
<< "N0=" << matmul_info.n0 << ", "
<< "K0=" << matmul_info.k0 << ", "
- << "export_rhs_to_cl_image=" << matmul_info.export_rhs_to_cl_image
- << "]";
+ << "export_rhs_to_cl_image=" << matmul_info.export_rhs_to_cl_image << "]";
return os;
}
@@ -3612,8 +3592,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const arm_compute::CpuMatM
{
os << "CpuMatMulSettings="
<< "["
- << "fast_math=" << settings.fast_math()
- << "]";
+ << "fast_math=" << settings.fast_math() << "]";
return os;
}
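
The hunks above only re-wrap the type-printer overloads; their output is unchanged. For orientation, a minimal usage sketch of these printers follows (not part of the patch; the utils/TypePrinter.h include path is an assumption based on the upstream tree):

#include <iostream>

#include "arm_compute/core/Types.h"
#include "utils/TypePrinter.h" // assumed location of the operator<< / to_string overloads shown above

int main()
{
    using namespace arm_compute;

    // operator<<(std::ostream &, const DataType &) and to_string(const Format &)
    // are among the overloads reformatted above.
    std::cout << "data type: " << DataType::F16 << "\n";
    std::cout << "format:    " << to_string(Format::RGB888) << "\n";
    return 0;
}
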
diff --git a/utils/Utils.cpp b/utils/Utils.cpp
index 545d64e4b9..a143dc497f 100644
--- a/utils/Utils.cpp
+++ b/utils/Utils.cpp
@@ -59,7 +59,7 @@ namespace
*/
void discard_comments(std::ifstream &fs)
{
- while(fs.peek() == '#')
+ while (fs.peek() == '#')
{
fs.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
}
@@ -71,11 +71,11 @@ void discard_comments(std::ifstream &fs)
*/
void discard_comments_and_spaces(std::ifstream &fs)
{
- while(true)
+ while (true)
{
discard_comments(fs);
- if(isspace(fs.peek()) == 0)
+ if (isspace(fs.peek()) == 0)
{
break;
}
@@ -88,13 +88,12 @@ void discard_comments_and_spaces(std::ifstream &fs)
#ifndef BENCHMARK_EXAMPLES
int run_example(int argc, char **argv, std::unique_ptr<Example> example)
{
- std::cout << "\n"
- << argv[0] << "\n\n";
+ std::cout << "\n" << argv[0] << "\n\n";
try
{
bool status = example->do_setup(argc, argv);
- if(!status)
+ if (!status)
{
return 1;
}
@@ -105,19 +104,17 @@ int run_example(int argc, char **argv, std::unique_ptr<Example> example)
return 0;
}
#ifdef ARM_COMPUTE_CL
- catch(cl::Error &err)
+ catch (cl::Error &err)
{
std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
- std::cerr << std::endl
- << "ERROR " << err.what() << "(" << err.err() << ")" << std::endl;
+ std::cerr << std::endl << "ERROR " << err.what() << "(" << err.err() << ")" << std::endl;
std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
}
#endif /* ARM_COMPUTE_CL */
- catch(std::runtime_error &err)
+ catch (std::runtime_error &err)
{
std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
- std::cerr << std::endl
- << "ERROR " << err.what() << " " << (errno ? strerror(errno) : "") << std::endl;
+ std::cerr << std::endl << "ERROR " << err.what() << " " << (errno ? strerror(errno) : "") << std::endl;
std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
}
@@ -131,13 +128,15 @@ void draw_detection_rectangle(ITensor *tensor, const DetectionWindow &rect, uint
{
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(tensor, Format::RGB888);
- uint8_t *top = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y)) + tensor->buffer();
- uint8_t *bottom = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y + rect.height)) + tensor->buffer();
- uint8_t *left = top;
- uint8_t *right = tensor->info()->offset_element_in_bytes(Coordinates(rect.x + rect.width, rect.y)) + tensor->buffer();
- size_t stride = tensor->info()->strides_in_bytes()[Window::DimY];
+ uint8_t *top = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y)) + tensor->buffer();
+ uint8_t *bottom =
+ tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y + rect.height)) + tensor->buffer();
+ uint8_t *left = top;
+ uint8_t *right =
+ tensor->info()->offset_element_in_bytes(Coordinates(rect.x + rect.width, rect.y)) + tensor->buffer();
+ size_t stride = tensor->info()->strides_in_bytes()[Window::DimY];
- for(size_t x = 0; x < rect.width; ++x)
+ for (size_t x = 0; x < rect.width; ++x)
{
top[0] = r;
top[1] = g;
@@ -150,7 +149,7 @@ void draw_detection_rectangle(ITensor *tensor, const DetectionWindow &rect, uint
bottom += 3;
}
- for(size_t y = 0; y < rect.height; ++y)
+ for (size_t y = 0; y < rect.height; ++y)
{
left[0] = r;
left[1] = g;
@@ -176,22 +175,22 @@ ImageType get_image_type_from_file(const std::string &filename)
fs.open(filename, std::ios::in | std::ios::binary);
// Identify type from magic number
- std::array<unsigned char, 2> magic_number{ { 0 } };
+ std::array<unsigned char, 2> magic_number{{0}};
fs >> magic_number[0] >> magic_number[1];
// PPM check
- if(static_cast<char>(magic_number[0]) == 'P' && static_cast<char>(magic_number[1]) == '6')
+ if (static_cast<char>(magic_number[0]) == 'P' && static_cast<char>(magic_number[1]) == '6')
{
type = ImageType::PPM;
}
- else if(magic_number[0] == 0xFF && magic_number[1] == 0xD8)
+ else if (magic_number[0] == 0xFF && magic_number[1] == 0xD8)
{
type = ImageType::JPEG;
}
fs.close();
}
- catch(std::runtime_error &e)
+ catch (std::runtime_error &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what());
}
@@ -202,7 +201,7 @@ ImageType get_image_type_from_file(const std::string &filename)
std::tuple<unsigned int, unsigned int, int> parse_ppm_header(std::ifstream &fs)
{
// Check the PPM magic number is valid
- std::array<char, 2> magic_number{ { 0 } };
+ std::array<char, 2> magic_number{{0}};
fs >> magic_number[0] >> magic_number[1];
ARM_COMPUTE_ERROR_ON_MSG(magic_number[0] != 'P' || magic_number[1] != '6', "Invalid file type");
ARM_COMPUTE_UNUSED(magic_number);
@@ -238,12 +237,12 @@ npy::header_t parse_npy_header(std::ifstream &fs) //NOLINT
// Parse header
npy::header_t header = npy::parse_header(header_s);
- bool fortran_order = false;
- std::vector<unsigned long> shape = header.shape;
+ bool fortran_order = false;
+ std::vector<unsigned long> shape = header.shape;
std::reverse(shape.begin(), shape.end());
- return npy::header_t{ header.dtype, fortran_order, shape };
+ return npy::header_t{header.dtype, fortran_order, shape};
}
/** This function returns the amount of memory free reading from /proc/meminfo
@@ -255,15 +254,15 @@ uint64_t get_mem_free_from_meminfo()
std::string line_attribute;
std::ifstream file_meminfo("/proc/meminfo");
- if(file_meminfo.is_open())
+ if (file_meminfo.is_open())
{
- while(!(file_meminfo >> line_attribute).fail())
+ while (!(file_meminfo >> line_attribute).fail())
{
//Test if is the line containing MemFree
- if(line_attribute == "MemFree:")
+ if (line_attribute == "MemFree:")
{
uint64_t mem_available;
- if(!(file_meminfo >> mem_available).fail())
+ if (!(file_meminfo >> mem_available).fail())
{
return mem_available;
}
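
The Utils.cpp changes above are whitespace-only as well. A sketch of how the reformatted draw_detection_rectangle() helper is typically driven (hypothetical example, not part of the patch; the arm_compute::utils namespace and the TensorInfo(TensorShape, Format) constructor are assumed from the upstream headers):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "utils/Utils.h"

int main()
{
    using namespace arm_compute;

    // RGB888 image, as required by the ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN check above.
    Tensor image;
    image.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::RGB888));
    image.allocator()->allocate();

    DetectionWindow box{};
    box.x      = 100;
    box.y      = 80;
    box.width  = 64;
    box.height = 48;

    // Writes the top/bottom rows and left/right columns of the box in red.
    utils::draw_detection_rectangle(&image, box, 255, 0, 0);
    return 0;
}
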
diff --git a/utils/Utils.h b/utils/Utils.h
index d181022ffe..626cbcf07f 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -87,9 +87,9 @@ public:
return true;
};
/** Run the example. */
- virtual void do_run() {};
+ virtual void do_run(){};
/** Teardown the example. */
- virtual void do_teardown() {};
+ virtual void do_teardown(){};
/** Default destructor. */
virtual ~Example() = default;
@@ -117,7 +117,8 @@ int run_example(int argc, char **argv)
* @param[in] g Green colour to use
* @param[in] b Blue colour to use
*/
-void draw_detection_rectangle(arm_compute::ITensor *tensor, const arm_compute::DetectionWindow &rect, uint8_t r, uint8_t g, uint8_t b);
+void draw_detection_rectangle(
+ arm_compute::ITensor *tensor, const arm_compute::DetectionWindow &rect, uint8_t r, uint8_t g, uint8_t b);
/** Gets image type given a file
*
@@ -157,7 +158,7 @@ inline std::string get_typestring(DataType data_type)
const unsigned int i = 1;
const char *c = reinterpret_cast<const char *>(&i);
std::string endianness;
- if(*c == 1)
+ if (*c == 1)
{
endianness = std::string("<");
}
@@ -167,7 +168,7 @@ inline std::string get_typestring(DataType data_type)
}
const std::string no_endianness("|");
- switch(data_type)
+ switch (data_type)
{
case DataType::U8:
case DataType::QASYMM8:
@@ -253,7 +254,8 @@ inline void unmap(CLTensor &tensor)
template <typename T>
class uniform_real_distribution_16bit
{
- static_assert(std::is_same<T, half>::value || std::is_same<T, bfloat16>::value, "Only half and bfloat16 data types supported");
+ static_assert(std::is_same<T, half>::value || std::is_same<T, bfloat16>::value,
+ "Only half and bfloat16 data types supported");
public:
using result_type = T;
@@ -262,8 +264,7 @@ public:
* @param[in] min Minimum value of the distribution
* @param[in] max Maximum value of the distribution
*/
- explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0)
- : dist(min, max)
+ explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0) : dist(min, max)
{
}
@@ -285,8 +286,7 @@ class NPYLoader
{
public:
/** Default constructor */
- NPYLoader()
- : _fs(), _shape(), _fortran_order(false), _typestring(), _file_layout(DataLayout::NCHW)
+ NPYLoader() : _fs(), _shape(), _fortran_order(false), _typestring(), _file_layout(DataLayout::NCHW)
{
}
@@ -310,7 +310,7 @@ public:
_fortran_order = header.fortran_order;
_typestring = header.dtype.str();
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", npy_filename.c_str(), e.what());
}
@@ -341,10 +341,10 @@ public:
// Use the size of the input NPY tensor
TensorShape shape;
shape.set_num_dimensions(_shape.size());
- for(size_t i = 0; i < _shape.size(); ++i)
+ for (size_t i = 0; i < _shape.size(); ++i)
{
size_t src = i;
- if(_fortran_order)
+ if (_fortran_order)
{
src = _shape.size() - 1 - i;
}
@@ -365,7 +365,8 @@ public:
void fill_tensor(T &tensor)
{
ARM_COMPUTE_ERROR_ON(!is_open());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::QASYMM8, arm_compute::DataType::S32, arm_compute::DataType::F32, arm_compute::DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::QASYMM8, arm_compute::DataType::S32,
+ arm_compute::DataType::F32, arm_compute::DataType::F16);
try
{
// Map buffer if creating a CLTensor
@@ -377,7 +378,8 @@ public:
const size_t end_position = _fs.tellg();
_fs.seekg(current_position, std::ios_base::beg);
- ARM_COMPUTE_ERROR_ON_MSG((end_position - current_position) < tensor.info()->tensor_shape().total_size() * tensor.info()->element_size(),
+ ARM_COMPUTE_ERROR_ON_MSG((end_position - current_position) <
+ tensor.info()->tensor_shape().total_size() * tensor.info()->element_size(),
"Not enough data in file");
ARM_COMPUTE_UNUSED(end_position);
@@ -385,12 +387,12 @@ public:
std::string expect_typestr = get_typestring(tensor.info()->data_type());
bool enable_f32_to_f16_conversion = false;
- if(_typestring != expect_typestr)
+ if (_typestring != expect_typestr)
{
const std::string f32_typestring = "<f4";
const std::string f16_typestring = "<f2";
// if typestring does not match, check whether _typestring is F32 and can be downcasted to expect_typestr
- if(_typestring == f32_typestring && expect_typestr == f16_typestring)
+ if (_typestring == f32_typestring && expect_typestr == f16_typestring)
{
enable_f32_to_f16_conversion = true;
}
@@ -402,11 +404,11 @@ public:
bool are_layouts_different = (_file_layout != tensor.info()->data_layout());
// Correct dimensions (Needs to match TensorShape dimension corrections)
- if(_shape.size() != tensor.info()->tensor_shape().num_dimensions())
+ if (_shape.size() != tensor.info()->tensor_shape().num_dimensions())
{
- for(int i = static_cast<int>(_shape.size()) - 1; i > 0; --i)
+ for (int i = static_cast<int>(_shape.size()) - 1; i > 0; --i)
{
- if(_shape[i] == 1)
+ if (_shape[i] == 1)
{
_shape.pop_back();
}
@@ -419,22 +421,28 @@ public:
TensorShape permuted_shape = tensor.info()->tensor_shape();
arm_compute::PermutationVector perm;
- if(are_layouts_different && tensor.info()->tensor_shape().num_dimensions() > 2)
+ if (are_layouts_different && tensor.info()->tensor_shape().num_dimensions() > 2)
{
- perm = (tensor.info()->data_layout() == arm_compute::DataLayout::NHWC) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U);
- arm_compute::PermutationVector perm_vec = (tensor.info()->data_layout() == arm_compute::DataLayout::NCHW) ? arm_compute::PermutationVector(2U, 0U, 1U) : arm_compute::PermutationVector(1U, 2U, 0U);
+ perm = (tensor.info()->data_layout() == arm_compute::DataLayout::NHWC)
+ ? arm_compute::PermutationVector(2U, 0U, 1U)
+ : arm_compute::PermutationVector(1U, 2U, 0U);
+ arm_compute::PermutationVector perm_vec =
+ (tensor.info()->data_layout() == arm_compute::DataLayout::NCHW)
+ ? arm_compute::PermutationVector(2U, 0U, 1U)
+ : arm_compute::PermutationVector(1U, 2U, 0U);
arm_compute::permute(permuted_shape, perm_vec);
}
// Validate tensor shape
- ARM_COMPUTE_ERROR_ON_MSG(_shape.size() != tensor.info()->tensor_shape().num_dimensions(), "Tensor ranks mismatch");
- for(size_t i = 0; i < _shape.size(); ++i)
+ ARM_COMPUTE_ERROR_ON_MSG(_shape.size() != tensor.info()->tensor_shape().num_dimensions(),
+ "Tensor ranks mismatch");
+ for (size_t i = 0; i < _shape.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_MSG(permuted_shape[i] != _shape[i], "Tensor dimensions mismatch");
}
- switch(tensor.info()->data_type())
+ switch (tensor.info()->data_type())
{
case arm_compute::DataType::QASYMM8:
case arm_compute::DataType::S32:
@@ -442,7 +450,8 @@ public:
case arm_compute::DataType::F16:
{
// Read data
- if(!are_layouts_different && !_fortran_order && tensor.info()->padding().empty() && !enable_f32_to_f16_conversion)
+ if (!are_layouts_different && !_fortran_order && tensor.info()->padding().empty() &&
+ !enable_f32_to_f16_conversion)
{
// If tensor has no padding read directly from stream.
_fs.read(reinterpret_cast<char *>(tensor.buffer()), tensor.info()->total_size());
@@ -452,19 +461,19 @@ public:
// If tensor has padding or is in fortran order accessing tensor elements through execution window.
Window window;
const unsigned int num_dims = _shape.size();
- if(_fortran_order)
+ if (_fortran_order)
{
- for(unsigned int dim = 0; dim < num_dims; dim++)
+ for (unsigned int dim = 0; dim < num_dims; dim++)
{
permuted_shape.set(dim, _shape[num_dims - dim - 1]);
perm.set(dim, num_dims - dim - 1);
}
- if(are_layouts_different)
+ if (are_layouts_different)
{
// Permute only if num_dimensions greater than 2
- if(num_dims > 2)
+ if (num_dims > 2)
{
- if(_file_layout == DataLayout::NHWC) // i.e destination is NCHW --> permute(1,2,0)
+ if (_file_layout == DataLayout::NHWC) // i.e destination is NCHW --> permute(1,2,0)
{
arm_compute::permute(perm, arm_compute::PermutationVector(1U, 2U, 0U));
}
@@ -477,22 +486,25 @@ public:
}
window.use_tensor_dimensions(permuted_shape);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- Coordinates dst(id);
- arm_compute::permute(dst, perm);
- if(enable_f32_to_f16_conversion)
- {
- float f32_val = 0;
- _fs.read(reinterpret_cast<char *>(&f32_val), 4u);
- half f16_val = half_float::half_cast<half, std::round_to_nearest>(f32_val);
- *(reinterpret_cast<half *>(tensor.ptr_to_element(dst))) = f16_val;
- }
- else
- {
- _fs.read(reinterpret_cast<char *>(tensor.ptr_to_element(dst)), tensor.info()->element_size());
- }
- });
+ execute_window_loop(window,
+ [&](const Coordinates &id)
+ {
+ Coordinates dst(id);
+ arm_compute::permute(dst, perm);
+ if (enable_f32_to_f16_conversion)
+ {
+ float f32_val = 0;
+ _fs.read(reinterpret_cast<char *>(&f32_val), 4u);
+ half f16_val =
+ half_float::half_cast<half, std::round_to_nearest>(f32_val);
+ *(reinterpret_cast<half *>(tensor.ptr_to_element(dst))) = f16_val;
+ }
+ else
+ {
+ _fs.read(reinterpret_cast<char *>(tensor.ptr_to_element(dst)),
+ tensor.info()->element_size());
+ }
+ });
}
break;
@@ -504,7 +516,7 @@ public:
// Unmap buffer if creating a CLTensor
unmap(tensor);
}
- catch(const std::ifstream::failure &e)
+ catch (const std::ifstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Loading NPY file: %s", e.what());
}
@@ -543,13 +555,12 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename)
const unsigned int width = tensor.info()->tensor_shape()[0];
const unsigned int height = tensor.info()->tensor_shape()[1];
- fs << "P6\n"
- << width << " " << height << " 255\n";
+ fs << "P6\n" << width << " " << height << " 255\n";
// Map buffer if creating a CLTensor
map(tensor, true);
- switch(tensor.info()->format())
+ switch (tensor.info()->format())
{
case arm_compute::Format::U8:
{
@@ -559,13 +570,15 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename)
arm_compute::Iterator in(&tensor, window);
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates &)
- {
- const unsigned char value = *in.ptr();
+ arm_compute::execute_window_loop(
+ window,
+ [&](const arm_compute::Coordinates &)
+ {
+ const unsigned char value = *in.ptr();
- fs << value << value << value;
- },
- in);
+ fs << value << value << value;
+ },
+ in);
break;
}
@@ -577,11 +590,13 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename)
arm_compute::Iterator in(&tensor, window);
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates &)
- {
- fs.write(reinterpret_cast<std::fstream::char_type *>(in.ptr()), width * tensor.info()->element_size());
- },
- in);
+ arm_compute::execute_window_loop(
+ window,
+ [&](const arm_compute::Coordinates &) {
+ fs.write(reinterpret_cast<std::fstream::char_type *>(in.ptr()),
+ width * tensor.info()->element_size());
+ },
+ in);
break;
}
@@ -592,7 +607,7 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename)
// Unmap buffer if creating a CLTensor
unmap(tensor);
}
- catch(const std::ofstream::failure &e)
+ catch (const std::ofstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Writing %s: (%s)", ppm_filename.c_str(), e.what());
}
@@ -620,7 +635,7 @@ void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order)
std::vector<npy::ndarray_len_t> shape(tensor.info()->num_dimensions());
- for(unsigned int i = 0, j = tensor.info()->num_dimensions() - 1; i < tensor.info()->num_dimensions(); ++i, --j)
+ for (unsigned int i = 0, j = tensor.info()->num_dimensions() - 1; i < tensor.info()->num_dimensions(); ++i, --j)
{
shape[i] = tensor.info()->tensor_shape()[!fortran_order ? j : i];
}
@@ -634,7 +649,7 @@ void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order)
const npy::dtype_t dtype = npy::dtype_map.at(std::type_index(typeid(tmp)));
std::ofstream stream(npy_filename, std::ofstream::binary);
- npy::header_t header{ dtype, fortran_order, shape };
+ npy::header_t header{dtype, fortran_order, shape};
npy::write_header(stream, header);
arm_compute::Window window;
@@ -642,16 +657,16 @@ void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order)
arm_compute::Iterator in(&tensor, window);
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates &)
- {
- stream.write(reinterpret_cast<const char *>(in.ptr()), sizeof(typestring_type));
- },
- in);
+ arm_compute::execute_window_loop(
+ window,
+ [&](const arm_compute::Coordinates &)
+ { stream.write(reinterpret_cast<const char *>(in.ptr()), sizeof(typestring_type)); },
+ in);
// Unmap buffer if creating a CLTensor
unmap(tensor);
}
- catch(const std::ofstream::failure &e)
+ catch (const std::ofstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Writing %s: (%s)", npy_filename.c_str(), e.what());
}
@@ -675,7 +690,7 @@ void load_trained_data(T &tensor, const std::string &filename)
// Open file
fs.open(filename, std::ios::in | std::ios::binary);
- if(!fs.good())
+ if (!fs.good())
{
throw std::runtime_error("Could not load binary data: " + filename);
}
@@ -687,23 +702,26 @@ void load_trained_data(T &tensor, const std::string &filename)
window.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, 1, 1));
- for(unsigned int d = 1; d < tensor.info()->num_dimensions(); ++d)
+ for (unsigned int d = 1; d < tensor.info()->num_dimensions(); ++d)
{
window.set(d, Window::Dimension(0, tensor.info()->tensor_shape()[d], 1));
}
arm_compute::Iterator in(&tensor, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- fs.read(reinterpret_cast<std::fstream::char_type *>(in.ptr()), tensor.info()->tensor_shape()[0] * tensor.info()->element_size());
- },
- in);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ {
+ fs.read(reinterpret_cast<std::fstream::char_type *>(in.ptr()),
+ tensor.info()->tensor_shape()[0] * tensor.info()->element_size());
+ },
+ in);
// Unmap buffer if creating a CLTensor
unmap(tensor);
}
- catch(const std::ofstream::failure &e)
+ catch (const std::ofstream::failure &e)
{
ARM_COMPUTE_ERROR_VAR("Writing %s: (%s)", filename.c_str(), e.what());
}
@@ -718,11 +736,8 @@ void fill_tensor_value(TensorType &tensor, T value)
window.use_tensor_dimensions(tensor.info()->tensor_shape());
Iterator it_tensor(&tensor, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<T *>(it_tensor.ptr()) = value;
- },
- it_tensor);
+ execute_window_loop(
+ window, [&](const Coordinates &) { *reinterpret_cast<T *>(it_tensor.ptr()) = value; }, it_tensor);
unmap(tensor);
}
@@ -745,22 +760,23 @@ void fill_tensor_vector(TensorType &tensor, std::vector<T> vec)
int i = 0;
Iterator it_tensor(&tensor, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<T *>(it_tensor.ptr()) = vec.at(i++);
- },
- it_tensor);
+ execute_window_loop(
+ window, [&](const Coordinates &) { *reinterpret_cast<T *>(it_tensor.ptr()) = vec.at(i++); }, it_tensor);
unmap(tensor);
}
template <typename T, typename TensorType>
-void fill_random_tensor(TensorType &tensor, std::random_device::result_type seed, T lower_bound = std::numeric_limits<T>::lowest(), T upper_bound = std::numeric_limits<T>::max())
+void fill_random_tensor(TensorType &tensor,
+ std::random_device::result_type seed,
+ T lower_bound = std::numeric_limits<T>::lowest(),
+ T upper_bound = std::numeric_limits<T>::max())
{
constexpr bool is_fp_16bit = std::is_same<T, half>::value || std::is_same<T, bfloat16>::value;
constexpr bool is_integral = std::is_integral<T>::value && !is_fp_16bit;
- using fp_dist_type = typename std::conditional<is_fp_16bit, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+ using fp_dist_type = typename std::conditional<is_fp_16bit, arm_compute::utils::uniform_real_distribution_16bit<T>,
+ std::uniform_real_distribution<T>>::type;
using dist_type = typename std::conditional<is_integral, std::uniform_int_distribution<T>, fp_dist_type>::type;
std::mt19937 gen(seed);
@@ -772,17 +788,16 @@ void fill_random_tensor(TensorType &tensor, std::random_device::result_type seed
window.use_tensor_dimensions(tensor.info()->tensor_shape());
Iterator it(&tensor, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- *reinterpret_cast<T *>(it.ptr()) = dist(gen);
- },
- it);
+ execute_window_loop(
+ window, [&](const Coordinates &) { *reinterpret_cast<T *>(it.ptr()) = dist(gen); }, it);
unmap(tensor);
}
template <typename T, typename TensorType>
-void fill_random_tensor(TensorType &tensor, T lower_bound = std::numeric_limits<T>::lowest(), T upper_bound = std::numeric_limits<T>::max())
+void fill_random_tensor(TensorType &tensor,
+ T lower_bound = std::numeric_limits<T>::lowest(),
+ T upper_bound = std::numeric_limits<T>::max())
{
std::random_device rd;
fill_random_tensor(tensor, rd(), lower_bound, upper_bound);
@@ -791,7 +806,8 @@ void fill_random_tensor(TensorType &tensor, T lower_bound = std::numeric_limits<
template <typename T>
void init_sgemm_output(T &dst, T &src0, T &src1, arm_compute::DataType dt)
{
- dst.allocator()->init(TensorInfo(TensorShape(src1.info()->dimension(0), src0.info()->dimension(1), src0.info()->dimension(2)), 1, dt));
+ dst.allocator()->init(TensorInfo(
+ TensorShape(src1.info()->dimension(0), src0.info()->dimension(1), src0.info()->dimension(2)), 1, dt));
}
/** This function returns the amount of memory free reading from /proc/meminfo
*
@@ -823,14 +839,16 @@ int compare_tensor(ITensor &tensor1, ITensor &tensor2, T tolerance)
Iterator itensor1(&tensor1, window);
Iterator itensor2(&tensor2, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- if(std::abs(*reinterpret_cast<T *>(itensor1.ptr()) - *reinterpret_cast<T *>(itensor2.ptr())) > tolerance)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- ++num_mismatches;
- }
- },
- itensor1, itensor2);
+ if (std::abs(*reinterpret_cast<T *>(itensor1.ptr()) - *reinterpret_cast<T *>(itensor2.ptr())) > tolerance)
+ {
+ ++num_mismatches;
+ }
+ },
+ itensor1, itensor2);
unmap(itensor1);
unmap(itensor2);
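
A short sketch of the tensor helpers whose lambdas were re-indented above (fill_tensor_value, the seeded fill_random_tensor overload, compare_tensor). This is illustrative only; the arm_compute::utils namespace, the TensorInfo(TensorShape, num_channels, DataType) constructor and compare_tensor returning the mismatch count it accumulates are assumptions taken from the upstream headers:

#include <iostream>

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "utils/Utils.h"

int main()
{
    using namespace arm_compute;

    Tensor a, b;
    a.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    a.allocator()->allocate();
    b.allocator()->allocate();

    utils::fill_tensor_value(a, 1.f);                                 // constant fill
    utils::fill_random_tensor<float>(b, /* seed */ 1234U, -1.f, 1.f); // seeded uniform fill

    // compare_tensor counts the elements that differ by more than the tolerance.
    const int mismatches = utils::compare_tensor<float>(a, b, /* tolerance */ 0.f);
    std::cout << "mismatches: " << mismatches << "\n";
    return 0;
}
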
diff --git a/utils/command_line/CommandLineParser.h b/utils/command_line/CommandLineParser.h
index 523f25e8a1..57796bce73 100644
--- a/utils/command_line/CommandLineParser.h
+++ b/utils/command_line/CommandLineParser.h
@@ -24,14 +24,13 @@
#ifndef ARM_COMPUTE_UTILS_COMMANDLINEPARSER
#define ARM_COMPUTE_UTILS_COMMANDLINEPARSER
-#include "Option.h"
#include "arm_compute/core/utils/misc/Utility.h"
+#include "Option.h"
#include <cstring>
#include <iostream>
#include <map>
#include <memory>
-#include <memory>
#include <regex>
#include <string>
#include <utility>
@@ -56,7 +55,7 @@ public:
* @return Pointer to the option. The option is owned by the parser.
*/
template <typename T, typename... As>
- T *add_option(const std::string &name, As &&... args);
+ T *add_option(const std::string &name, As &&...args);
/** Function to add a new positional argument to the parser.
*
@@ -65,7 +64,7 @@ public:
* @return Pointer to the option. The option is owned by the parser.
*/
template <typename T, typename... As>
- T *add_positional_option(As &&... args);
+ T *add_positional_option(As &&...args);
/** Parses the command line arguments and updates the options accordingly.
*
@@ -101,14 +100,14 @@ private:
};
template <typename T, typename... As>
-inline T *CommandLineParser::add_option(const std::string &name, As &&... args)
+inline T *CommandLineParser::add_option(const std::string &name, As &&...args)
{
auto result = _options.emplace(name, std::make_unique<T>(name, std::forward<As>(args)...));
return static_cast<T *>(result.first->second.get());
}
template <typename T, typename... As>
-inline T *CommandLineParser::add_positional_option(As &&... args)
+inline T *CommandLineParser::add_positional_option(As &&...args)
{
_positional_options.emplace_back(std::make_unique<T>(std::forward<As>(args)...));
return static_cast<T *>(_positional_options.back().get());
@@ -116,11 +115,11 @@ inline T *CommandLineParser::add_positional_option(As &&... args)
inline void CommandLineParser::parse(int argc, char **argv)
{
- const std::regex option_regex{ "--((?:no-)?)([^=]+)(?:=(.*))?" };
+ const std::regex option_regex{"--((?:no-)?)([^=]+)(?:=(.*))?"};
- const auto set_option = [&](const std::string & option, const std::string & name, const std::string & value)
+ const auto set_option = [&](const std::string &option, const std::string &name, const std::string &value)
{
- if(_options.find(name) == _options.end())
+ if (_options.find(name) == _options.end())
{
_unknown_options.push_back(option);
return;
@@ -128,7 +127,7 @@ inline void CommandLineParser::parse(int argc, char **argv)
const bool success = _options[name]->parse(value);
- if(!success)
+ if (!success)
{
_invalid_options.push_back(option);
}
@@ -136,26 +135,27 @@ inline void CommandLineParser::parse(int argc, char **argv)
unsigned int positional_index = 0;
- for(int i = 1; i < argc; ++i)
+ for (int i = 1; i < argc; ++i)
{
- std::string mixed_case_opt{ argv[i] };
+ std::string mixed_case_opt{argv[i]};
int equal_sign = mixed_case_opt.find('=');
int pos = (equal_sign == -1) ? strlen(argv[i]) : equal_sign;
- const std::string option = arm_compute::utility::tolower(mixed_case_opt.substr(0, pos)) + mixed_case_opt.substr(pos);
- std::smatch option_matches;
+ const std::string option =
+ arm_compute::utility::tolower(mixed_case_opt.substr(0, pos)) + mixed_case_opt.substr(pos);
+ std::smatch option_matches;
- if(std::regex_match(option, option_matches, option_regex))
+ if (std::regex_match(option, option_matches, option_regex))
{
// Boolean option
- if(option_matches.str(3).empty())
+ if (option_matches.str(3).empty())
{
set_option(option, option_matches.str(2), option_matches.str(1).empty() ? "true" : "false");
}
else
{
// Can't have "no-" and a value
- if(!option_matches.str(1).empty())
+ if (!option_matches.str(1).empty())
{
_invalid_options.emplace_back(option);
}
@@ -167,7 +167,7 @@ inline void CommandLineParser::parse(int argc, char **argv)
}
else
{
- if(positional_index >= _positional_options.size())
+ if (positional_index >= _positional_options.size())
{
_invalid_options.push_back(mixed_case_opt);
}
@@ -184,30 +184,30 @@ inline bool CommandLineParser::validate() const
{
bool is_valid = true;
- for(const auto &option : _options)
+ for (const auto &option : _options)
{
- if(option.second->is_required() && !option.second->is_set())
+ if (option.second->is_required() && !option.second->is_set())
{
is_valid = false;
std::cerr << "ERROR: Option '" << option.second->name() << "' is required but not given!\n";
}
}
- for(const auto &option : _positional_options)
+ for (const auto &option : _positional_options)
{
- if(option->is_required() && !option->is_set())
+ if (option->is_required() && !option->is_set())
{
is_valid = false;
std::cerr << "ERROR: Option '" << option->name() << "' is required but not given!\n";
}
}
- for(const auto &option : _unknown_options)
+ for (const auto &option : _unknown_options)
{
std::cerr << "WARNING: Skipping unknown option '" << option << "'!\n";
}
- for(const auto &option : _invalid_options)
+ for (const auto &option : _invalid_options)
{
std::cerr << "WARNING: Skipping invalid option '" << option << "'!\n";
}
@@ -219,19 +219,19 @@ inline void CommandLineParser::print_help(const std::string &program_name) const
{
std::cout << "usage: " << program_name << " \n";
- for(const auto &option : _options)
+ for (const auto &option : _options)
{
std::cout << option.second->help() << "\n";
}
- for(const auto &option : _positional_options)
+ for (const auto &option : _positional_options)
{
std::string help_to_print;
// Extract help sub-string
const std::string help_str = option->help();
const size_t help_pos = help_str.find(" - ");
- if(help_pos != std::string::npos)
+ if (help_pos != std::string::npos)
{
help_to_print = help_str.substr(help_pos);
}
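
The parser logic is untouched; only brace-init spacing, lambda parameter style and line wrapping change. A minimal usage sketch (illustrative; the arm_compute::utils namespace and the value() accessors are assumed from the upstream option headers, not shown in this hunk):

#include <iostream>

#include "utils/command_line/CommandLineParser.h"
#include "utils/command_line/SimpleOption.h"
#include "utils/command_line/ToggleOption.h"

int main(int argc, char **argv)
{
    using namespace arm_compute::utils;

    CommandLineParser parser;
    auto *threads = parser.add_option<SimpleOption<int>>("threads", 1); // set via --threads=4
    auto *verbose = parser.add_option<ToggleOption>("verbose", false);  // set via --verbose / --no-verbose

    parser.parse(argc, argv);
    if (!parser.validate())
    {
        parser.print_help(argv[0]);
        return 1;
    }

    std::cout << "threads=" << threads->value() << ", verbose=" << verbose->value() << "\n";
    return 0;
}
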
diff --git a/utils/command_line/EnumListOption.h b/utils/command_line/EnumListOption.h
index f4ee283528..6c4146fa75 100644
--- a/utils/command_line/EnumListOption.h
+++ b/utils/command_line/EnumListOption.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_UTILS_ENUMLISTOPTION
#include "Option.h"
-
#include <initializer_list>
#include <set>
#include <sstream>
@@ -57,7 +56,7 @@ public:
*/
EnumListOption(std::string name, std::set<T> allowed_values, std::initializer_list<T> &&default_values);
- bool parse(std::string value) override;
+ bool parse(std::string value) override;
std::string help() const override;
/** Get the values of the option.
@@ -73,13 +72,17 @@ private:
template <typename T>
inline EnumListOption<T>::EnumListOption(std::string name, std::set<T> allowed_values)
- : Option{ std::move(name) }, _allowed_values{ std::move(allowed_values) }
+ : Option{std::move(name)}, _allowed_values{std::move(allowed_values)}
{
}
template <typename T>
-inline EnumListOption<T>::EnumListOption(std::string name, std::set<T> allowed_values, std::initializer_list<T> &&default_values)
- : Option{ std::move(name), false, true }, _values{ std::forward<std::initializer_list<T>>(default_values) }, _allowed_values{ std::move(allowed_values) }
+inline EnumListOption<T>::EnumListOption(std::string name,
+ std::set<T> allowed_values,
+ std::initializer_list<T> &&default_values)
+ : Option{std::move(name), false, true},
+ _values{std::forward<std::initializer_list<T>>(default_values)},
+ _allowed_values{std::move(allowed_values)}
{
}
@@ -90,10 +93,10 @@ bool EnumListOption<T>::parse(std::string value)
_values.clear();
_is_set = true;
- std::stringstream stream{ value };
+ std::stringstream stream{value};
std::string item;
- while(!std::getline(stream, item, ',').fail())
+ while (!std::getline(stream, item, ',').fail())
{
try
{
@@ -102,9 +105,9 @@ bool EnumListOption<T>::parse(std::string value)
item_stream >> typed_value;
- if(!item_stream.fail())
+ if (!item_stream.fail())
{
- if(_allowed_values.count(typed_value) == 0)
+ if (_allowed_values.count(typed_value) == 0)
{
_is_set = false;
continue;
@@ -115,7 +118,7 @@ bool EnumListOption<T>::parse(std::string value)
_is_set = _is_set && !item_stream.fail();
}
- catch(const std::invalid_argument &)
+ catch (const std::invalid_argument &)
{
_is_set = false;
}
@@ -130,7 +133,7 @@ std::string EnumListOption<T>::help() const
std::stringstream msg;
msg << "--" + name() + "={";
- for(const auto &value : _allowed_values)
+ for (const auto &value : _allowed_values)
{
msg << value << ",";
}
diff --git a/utils/command_line/EnumOption.h b/utils/command_line/EnumOption.h
index 6bcfe5f14e..eb43b6c54e 100644
--- a/utils/command_line/EnumOption.h
+++ b/utils/command_line/EnumOption.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_UTILS_ENUMOPTION
#include "SimpleOption.h"
-
#include <set>
#include <sstream>
#include <stdexcept>
@@ -55,7 +54,7 @@ public:
*/
EnumOption(std::string name, std::set<T> allowed_values, T default_value);
- bool parse(std::string value) override;
+ bool parse(std::string value) override;
std::string help() const override;
/** Get the selected value.
@@ -70,13 +69,13 @@ private:
template <typename T>
inline EnumOption<T>::EnumOption(std::string name, std::set<T> allowed_values)
- : SimpleOption<T>{ std::move(name) }, _allowed_values{ std::move(allowed_values) }
+ : SimpleOption<T>{std::move(name)}, _allowed_values{std::move(allowed_values)}
{
}
template <typename T>
inline EnumOption<T>::EnumOption(std::string name, std::set<T> allowed_values, T default_value)
- : SimpleOption<T>{ std::move(name), std::move(default_value) }, _allowed_values{ std::move(allowed_values) }
+ : SimpleOption<T>{std::move(name), std::move(default_value)}, _allowed_values{std::move(allowed_values)}
{
}
@@ -85,14 +84,14 @@ bool EnumOption<T>::parse(std::string value)
{
try
{
- std::stringstream stream{ value };
+ std::stringstream stream{value};
T typed_value{};
stream >> typed_value;
- if(!stream.fail())
+ if (!stream.fail())
{
- if(_allowed_values.count(typed_value) == 0)
+ if (_allowed_values.count(typed_value) == 0)
{
return false;
}
@@ -104,7 +103,7 @@ bool EnumOption<T>::parse(std::string value)
return false;
}
- catch(const std::invalid_argument &)
+ catch (const std::invalid_argument &)
{
return false;
}
@@ -116,7 +115,7 @@ std::string EnumOption<T>::help() const
std::stringstream msg;
msg << "--" + this->name() + "={";
- for(const auto &value : _allowed_values)
+ for (const auto &value : _allowed_values)
{
msg << value << ",";
}
diff --git a/utils/command_line/ListOption.h b/utils/command_line/ListOption.h
index b290191e08..f318e1646a 100644
--- a/utils/command_line/ListOption.h
+++ b/utils/command_line/ListOption.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_UTILS_LISTOPTION
#include "Option.h"
-
#include <initializer_list>
#include <sstream>
#include <stdexcept>
@@ -50,7 +49,7 @@ public:
*/
ListOption(std::string name, std::initializer_list<T> &&default_values);
- bool parse(std::string value) override;
+ bool parse(std::string value) override;
std::string help() const override;
/** Get the list of option values.
@@ -65,7 +64,7 @@ private:
template <typename T>
inline ListOption<T>::ListOption(std::string name, std::initializer_list<T> &&default_values)
- : Option{ std::move(name), false, true }, _values{ std::forward<std::initializer_list<T>>(default_values) }
+ : Option{std::move(name), false, true}, _values{std::forward<std::initializer_list<T>>(default_values)}
{
}
@@ -76,17 +75,17 @@ bool ListOption<T>::parse(std::string value)
try
{
- std::stringstream stream{ value };
+ std::stringstream stream{value};
std::string item;
- while(!std::getline(stream, item, ',').fail())
+ while (!std::getline(stream, item, ',').fail())
{
std::stringstream item_stream(item);
T typed_value{};
item_stream >> typed_value;
- if(!item_stream.fail())
+ if (!item_stream.fail())
{
_values.emplace_back(typed_value);
}
@@ -96,7 +95,7 @@ bool ListOption<T>::parse(std::string value)
return _is_set;
}
- catch(const std::invalid_argument &)
+ catch (const std::invalid_argument &)
{
return false;
}
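
ListOption, like EnumListOption above, splits its input on commas in the std::getline loop shown here. A small standalone sketch (the value() accessor is assumed from the upstream header; the constructor taking default values is the one in this hunk):

#include <iostream>

#include "utils/command_line/ListOption.h"

int main()
{
    using namespace arm_compute::utils;

    ListOption<int> sizes("sizes", {8, 16}); // option name plus default values
    sizes.parse("1,2,3");                    // comma-separated list, parsed by the loop above

    for (const int v : sizes.value())
    {
        std::cout << v << " ";
    }
    std::cout << "\n";
    return 0;
}
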
diff --git a/utils/command_line/Option.h b/utils/command_line/Option.h
index c845e5499f..b4288538b0 100644
--- a/utils/command_line/Option.h
+++ b/utils/command_line/Option.h
@@ -97,18 +97,17 @@ public:
protected:
std::string _name;
- bool _is_required{ false };
- bool _is_set{ false };
+ bool _is_required{false};
+ bool _is_set{false};
std::string _help{};
};
-inline Option::Option(std::string name)
- : _name{ std::move(name) }
+inline Option::Option(std::string name) : _name{std::move(name)}
{
}
inline Option::Option(std::string name, bool is_required, bool is_set)
- : _name{ std::move(name) }, _is_required{ is_required }, _is_set{ is_set }
+ : _name{std::move(name)}, _is_required{is_required}, _is_set{is_set}
{
}
diff --git a/utils/command_line/SimpleOption.h b/utils/command_line/SimpleOption.h
index d76797375d..f6329c1790 100644
--- a/utils/command_line/SimpleOption.h
+++ b/utils/command_line/SimpleOption.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_UTILS_SIMPLEOPTION
#include "Option.h"
-
#include <sstream>
#include <stdexcept>
#include <string>
@@ -74,7 +73,7 @@ protected:
template <typename T>
inline SimpleOption<T>::SimpleOption(std::string name, T default_value)
- : Option{ std::move(name), false, true }, _value{ std::move(default_value) }
+ : Option{std::move(name), false, true}, _value{std::move(default_value)}
{
}
@@ -83,12 +82,12 @@ bool SimpleOption<T>::parse(std::string value)
{
try
{
- std::stringstream stream{ std::move(value) };
+ std::stringstream stream{std::move(value)};
stream >> _value;
_is_set = !stream.fail();
return _is_set;
}
- catch(const std::invalid_argument &)
+ catch (const std::invalid_argument &)
{
return false;
}
diff --git a/utils/command_line/ToggleOption.h b/utils/command_line/ToggleOption.h
index d3c68663b5..694b7bb9e6 100644
--- a/utils/command_line/ToggleOption.h
+++ b/utils/command_line/ToggleOption.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_UTILS_TOGGLEOPTION
#include "SimpleOption.h"
-
#include <string>
namespace arm_compute
@@ -45,26 +44,23 @@ public:
*/
ToggleOption(std::string name, bool default_value);
- bool parse(std::string value) override;
+ bool parse(std::string value) override;
std::string help() const override;
};
inline ToggleOption::ToggleOption(std::string name, bool default_value)
- : SimpleOption<bool>
-{
- std::move(name), default_value
-}
+ : SimpleOption<bool>{std::move(name), default_value}
{
}
inline bool ToggleOption::parse(std::string value)
{
- if(value == "true")
+ if (value == "true")
{
_value = true;
_is_set = true;
}
- else if(value == "false")
+ else if (value == "false")
{
_value = false;
_is_set = true;